Aho–Corasick Algorithm

Aho–Corasick algorithm is a string searching algorithm. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the "dictionary") within an input text. It matches all strings simultaneously.



									/*****Please include following header files*****/
// string.h
// stdlib.h
/***********************************************/

const int MaxStates = 6 * 50 + 10;
const int MaxChars = 26;

int Out[MaxStates];
int FF[MaxStates];
int GF[MaxStates][MaxChars];

int* AddItemInArray(int* arr, int count, int item) {
	int* newArr = (int*)malloc(sizeof(int) * (count + 1));

	for (int i = 0; i < count; ++i) {
		newArr[i] = arr[i];
	}

	newArr[count] = item;

	return newArr;
}

int* InsertItemInArray(int* arr, int count, int index, int item) {
	int* newArr = (int*)malloc(sizeof(int) * (count + 1));

	for (int i = 0; i < index; ++i) {
		newArr[i] = arr[i];
	}

	newArr[index] = item;

	for (int i = index; i < count; ++i) {
		newArr[i + 1] = arr[i];
	}

	return newArr;
}

int* RemoveItemFromArray(int* arr, int count, int index) {
	int* newArr = (int*)malloc(sizeof(int) * (count - 1));

	for (int i = 0; i < index; ++i) {
		newArr[i] = arr[i];
	}

	for (int i = index + 1; i < count; ++i) {
		newArr[i - 1] = arr[i];
	}

	return newArr;
}

int BuildMatchingMachine(const char** words, int wordsCount, char lowestChar = 'a', char highestChar = 'z')
{
	memset(Out, 0, sizeof Out);
	memset(FF, -1, sizeof FF);
	memset(GF, -1, sizeof GF);

	int states = 1;

	for (int i = 0; i < wordsCount; ++i)
	{
		const char* keyword = words[i];
		int currentState = 0;

		for (int j = 0; j < strlen(keyword); ++j)
		{
			int c = keyword[j] - lowestChar;

			if (GF[currentState][c] == -1)
			{
				GF[currentState][c] = states++;
			}

			currentState = GF[currentState][c];
		}

		Out[currentState] |= (1 << i);
	}

	for (int c = 0; c < MaxChars; ++c)
	{
		if (GF[0][c] == -1)
		{
			GF[0][c] = 0;
		}
	}

	int* q = (int*)malloc(sizeof(int));
	int qSize = 0;
	for (int c = 0; c <= highestChar - lowestChar; ++c)
	{
		if (GF[0][c] != -1 && GF[0][c] != 0)
		{
			FF[GF[0][c]] = 0;
			q = AddItemInArray(q, qSize++, GF[0][c]);
		}
	}

	while (qSize)
	{
		int state = q[0];
		q = RemoveItemFromArray(q, qSize--, 0);

		for (int c = 0; c <= highestChar - lowestChar; ++c)
		{
			if (GF[state][c] != -1)
			{
				int failure = FF[state];

				while (GF[failure][c] == -1)
				{
					failure = FF[failure];
				}

				failure = GF[failure][c];
				FF[GF[state][c]] = failure;
				Out[GF[state][c]] |= Out[failure];
				q = AddItemInArray(q, qSize++, GF[state][c]);
			}
		}
	}

	return states;
}

int FindNextState(int currentState, char nextInput, char lowestChar = 'a')
{
	int answer = currentState;
	int c = nextInput - lowestChar;

	while (GF[answer][c] == -1)
	{
		answer = FF[answer];
	}

	return GF[answer][c];
}

int* FindAllStates(const char* text, const char** keywords, int keywordsCount, char lowestChar = 'a', char highestChar = 'z') {
	BuildMatchingMachine(keywords, keywordsCount, lowestChar, highestChar);

	int currentState = 0;
	int* retVal = (int*)malloc(sizeof(int));;
	int retValSize = 0;

	for (int i = 0; i < strlen(text); ++i)
	{
		currentState = FindNextState(currentState, text[i], lowestChar);

		if (Out[currentState] == 0)
			continue;

		for (int j = 0; j < keywordsCount; ++j)
		{
			if (Out[currentState] & (1 << j))
			{
				retVal = InsertItemInArray(retVal, retValSize++, 0, i - strlen(keywords[j]) + 1);
			}
		}
	}

	return retVal;
}
								


Example

									const char* keywords[] = { "he" , "she",  "hers", "his" };

const char* text = "ahishers";

int* states = FindAllStates(text, keywords, 4);
								


Output

									4
3
4
1