Optimal Mismatch Algorithm

This algorithm works by scanning pattern characters from the least frequent one to the most frequent one. Doing so one may hope to have a mismatch most of the times and thus to scan the whole text very quickly. One needs to know the frequencies of each of the character of the alphabet.



									/*****Please include following header files*****/
// string.h
// stack
/***********************************************/

/*****Please use following namespaces*****/
// std
/*****************************************/

typedef struct {
	int Comp;
	stack<int> Indexes;
} Result;

typedef struct {
	int Location;
	char Character;
} Pattern;

char *_x, *_y;
int _m, _n, _qsBcLen;
int *_adaptedGs, *_qsBc, *_frequency;
Pattern* _pattern;

static int Max(int a, int b) 
{
	return a > b ? a : b;
}

static int OptimalPatternCompare(Pattern pat1, Pattern pat2, int* freq)
{
	int fx = freq[pat1.Character] - freq[pat2.Character];
	return (fx != 0 ? (fx > 0 ? 1 : -1) : (pat2.Location - pat1.Location));
}

static void QuickSortPattern(Pattern* pat, int low, int n, int* freq)
{
	int lo = low;
	int hi = n;

	if (lo >= n) return;

	Pattern mid = pat[(lo + hi) / 2];

	while (lo < hi)
	{
		while (lo < hi && OptimalPatternCompare(pat[lo], mid, freq) < 0) lo++;
		while (lo < hi && OptimalPatternCompare(pat[hi], mid, freq) > 0) hi--;

		if (lo < hi)
		{
			Pattern temp = pat[lo];
			pat[lo] = pat[hi];
			pat[hi] = temp;
		}
	}

	if (hi < lo)
	{
		int temp = hi;
		hi = lo;
		lo = temp;
	}

	QuickSortPattern(pat, low, lo, freq);
	QuickSortPattern(pat, lo == low ? lo + 1 : lo, n, freq);
}

static void OrderPattern(char* x, Pattern* pat, int* freq)
{
	int xLen = strlen(x);
	for (int i = 0; i < xLen; ++i)
	{
		Pattern ptrn;
		ptrn.Location = i;
		ptrn.Character = x[i];
		pat[i] = ptrn;
	}

	QuickSortPattern(pat, 0, xLen - 1, freq);
}

static int MatchShift(char* x, int ploc, int lShift, Pattern* pat)
{
	int i, j;
	int xLen = strlen(x);

	for (; lShift < xLen; ++lShift)
	{
		i = ploc;
		while (--i >= 0)
		{
			if ((j = (pat[i].Location - lShift)) < 0) continue;
			if (pat[i].Character != x[j]) break;
		}
		if (i < 0) break;
	}

	return (lShift);
}

static void PreAdaptedGs(char* x, int* adaptedGs, Pattern* pat)
{
	int lShift, i, pLoc;
	int xLen = strlen(x);
	adaptedGs[0] = lShift = 1;

	for (pLoc = 1; pLoc <= xLen; ++pLoc)
	{
		lShift = MatchShift(x, pLoc, lShift, pat);
		adaptedGs[pLoc] = lShift;
	}

	for (pLoc = 0; pLoc < xLen; ++pLoc)
	{
		lShift = adaptedGs[pLoc];
		while (lShift < xLen)
		{
			i = pat[pLoc].Location - lShift;
			if (i < 0 || pat[pLoc].Character != x[i]) break;
			++lShift;
			lShift = MatchShift(x, pLoc, lShift, pat);
		}
		adaptedGs[pLoc] = lShift;
	}
}

static int* CalculateCharFrequency(char* x, char* y, int z)
{
	int i;
	int xLen = strlen(x);
	int yLen = strlen(y);
	int* freq = new int[z];
	for (i = 0; i < xLen; i++) freq[x[i]]++;
	for (i = 0; i < yLen; i++) freq[y[i]]++;
	return freq;
}

static void PreQsBc(char* x, int* qsBc, int qsBcLen)
{
	int xLen = strlen(x);
	int i, m = xLen;

	for (i = 0; i < qsBcLen; ++i)
		qsBc[i] = m + 1;

	for (i = 0; i < m; ++i)
		qsBc[x[i]] = m - i;
}

static void SetupOptimalSearch()
{
	OrderPattern(_x, _pattern, _frequency);
	PreQsBc(_x, _qsBc, _qsBcLen);
	PreAdaptedGs(_x, _adaptedGs, _pattern);
}

static void InitOptimalSearch(char* pattern, char* source)
{
	_x = pattern;
	_y = source;
	_m = strlen(_x);
	_n = strlen(_y);
	_adaptedGs = new int[_m + 1];
	_qsBcLen = 65536;
	_qsBc = new int[_qsBcLen];
	_frequency = CalculateCharFrequency(_x, _y, 65536);
	_pattern = new Pattern[_m];
}

static Result FindAll()
{
	int i, j;
	stack<int> result;
	SetupOptimalSearch();

	j = 0;
	int jOld = 0;
	while (j <= _n - _m)
	{
		i = 0;
		while (i < _m && _pattern[i].Character == _y[j + _pattern[i].Location]) ++i;

		if (i >= _m)
			result.push(j);

		jOld = j;
		if (j < _n - _m)
			j += Max(_adaptedGs[i], _qsBc[_y[j + _m]]);
		else
			j += _adaptedGs[i];
	}

	Result retVal;
	retVal.Comp = jOld;
	retVal.Indexes = result;

	return retVal;
}
								


Example

									char* source = "GCATCGCAGAGAGTATACAGTACG";
char* pattern = "GCAGAGAG";
InitOptimalSearch(pattern, source);
Result result = FindAll();
								


Output

									result {
	Comp: 14
	Indexes: { 5 }
}