Optimal Mismatch Algorithm
This algorithm works by scanning pattern characters from the least frequent one to the most frequent one. Doing so one may hope to have a mismatch most of the times and thus to scan the whole text very quickly. One needs to know the frequencies of each of the character of the alphabet.
/*****Please include following header files*****/
// stdlib.h
// string.h
/***********************************************/
typedef struct {
int Comp;
int* Indexes;
} Result;
typedef struct {
int Location;
char Character;
} Pattern;
char *_x, *_y;
int _m, _n, _qsBcLen;
int *_adaptedGs, *_qsBc, *_frequency;
Pattern* _pattern;
int Max(int a, int b)
{
return a > b ? a : b;
}
int* AddItemInArray(int* arr, int count, int item)
{
int* newArr = (int*)malloc(sizeof(int) * (count + 1));
for (int i = 0; i < count; ++i) {
newArr[i] = arr[i];
}
newArr[count] = item;
return newArr;
}
int OptimalPatternCompare(Pattern pat1, Pattern pat2, int* freq)
{
int fx = freq[pat1.Character] - freq[pat2.Character];
return (fx != 0 ? (fx > 0 ? 1 : -1) : (pat2.Location - pat1.Location));
}
void QuickSortPattern(Pattern* pat, int low, int n, int* freq)
{
int lo = low;
int hi = n;
if (lo >= n) return;
Pattern mid = pat[(lo + hi) / 2];
while (lo < hi)
{
while (lo < hi && OptimalPatternCompare(pat[lo], mid, freq) < 0) lo++;
while (lo < hi && OptimalPatternCompare(pat[hi], mid, freq) > 0) hi--;
if (lo < hi)
{
Pattern temp = pat[lo];
pat[lo] = pat[hi];
pat[hi] = temp;
}
}
if (hi < lo)
{
int temp = hi;
hi = lo;
lo = temp;
}
QuickSortPattern(pat, low, lo, freq);
QuickSortPattern(pat, lo == low ? lo + 1 : lo, n, freq);
}
void OrderPattern(char* x, Pattern* pat, int* freq)
{
int xLen = strlen(x);
for (int i = 0; i < xLen; ++i)
{
Pattern ptrn;
ptrn.Location = i;
ptrn.Character = x[i];
pat[i] = ptrn;
}
QuickSortPattern(pat, 0, xLen - 1, freq);
}
int MatchShift(char* x, int ploc, int lShift, Pattern* pat)
{
int i, j;
int xLen = strlen(x);
for (; lShift < xLen; ++lShift)
{
i = ploc;
while (--i >= 0)
{
if ((j = (pat[i].Location - lShift)) < 0) continue;
if (pat[i].Character != x[j]) break;
}
if (i < 0) break;
}
return (lShift);
}
void PreAdaptedGs(char* x, int* adaptedGs, Pattern* pat)
{
int lShift, i, pLoc;
int xLen = strlen(x);
adaptedGs[0] = lShift = 1;
for (pLoc = 1; pLoc <= xLen; ++pLoc)
{
lShift = MatchShift(x, pLoc, lShift, pat);
adaptedGs[pLoc] = lShift;
}
for (pLoc = 0; pLoc < xLen; ++pLoc)
{
lShift = adaptedGs[pLoc];
while (lShift < xLen)
{
i = pat[pLoc].Location - lShift;
if (i < 0 || pat[pLoc].Character != x[i]) break;
++lShift;
lShift = MatchShift(x, pLoc, lShift, pat);
}
adaptedGs[pLoc] = lShift;
}
}
int* CalculateCharFrequency(char* x, char* y, int z)
{
int i;
int xLen = strlen(x);
int yLen = strlen(y);
int* freq = (int*)malloc(sizeof(int) * z);
for (i = 0; i < xLen; i++) freq[x[i]]++;
for (i = 0; i < yLen; i++) freq[y[i]]++;
return freq;
}
void PreQsBc(char* x, int* qsBc, int qsBcLen)
{
int xLen = strlen(x);
int i, m = xLen;
for (i = 0; i < qsBcLen; ++i)
qsBc[i] = m + 1;
for (i = 0; i < m; ++i)
qsBc[x[i]] = m - i;
}
void SetupOptimalSearch()
{
OrderPattern(_x, _pattern, _frequency);
PreQsBc(_x, _qsBc, _qsBcLen);
PreAdaptedGs(_x, _adaptedGs, _pattern);
}
void InitOptimalSearch(char* pattern, char* source)
{
_x = pattern;
_y = source;
_m = strlen(_x);
_n = strlen(_y);
_adaptedGs = (int*)malloc(sizeof(int) * (_m + 1));
_qsBcLen = 65536;
_qsBc = (int*)malloc(sizeof(int) * _qsBcLen);
_frequency = CalculateCharFrequency(_x, _y, 65536);
_pattern = (Pattern*)malloc(sizeof(Pattern) * _m);
}
Result FindAll()
{
int i, j, count = 0;
int* result = (int*)malloc(sizeof(int));
SetupOptimalSearch();
j = 0;
int jOld = 0;
while (j <= _n - _m)
{
i = 0;
while (i < _m && _pattern[i].Character == _y[j + _pattern[i].Location]) ++i;
if (i >= _m)
result = AddItemInArray(result, count++, j);
jOld = j;
if (j < _n - _m)
j += Max(_adaptedGs[i], _qsBc[_y[j + _m]]);
else
j += _adaptedGs[i];
}
Result retVal;
retVal.Comp = jOld;
retVal.Indexes = result;
return retVal;
}
Example
char* source = "GCATCGCAGAGAGTATACAGTACG";
char* pattern = "GCAGAGAG";
InitOptimalSearch(pattern, source);
Result result = FindAll();
Output
result {
Comp: 14
Indexes: { 5 }
}