Aho–Corasick Algorithm
Aho–Corasick algorithm is a string searching algorithm. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the "dictionary") within an input text. It matches all strings simultaneously.
$MaxStates = 6 * 50 + 10;
$MaxChars = 26;
$Out = array();
$FF = array();
$GF = array();
function BuildMatchingMachine($words, $lowestChar = 'a', $highestChar = 'z')
{
global $Out, $FF, $GF, $MaxStates, $MaxChars;
$Out = array_fill(0, $MaxStates, 0);
$FF = array_fill(0, $MaxStates, -1);
for ($i = 0; $i < $MaxStates; $i++)
{
for ($j = 0; $j < $MaxChars; $j++)
{
$GF[$i][$j] = -1;
}
}
$states = 1;
for ($i = 0; $i < count($words); $i++)
{
$keyword = $words[$i];
$currentState = 0;
for ($j = 0; $j < strlen($keyword); $j++)
{
$c = ord($keyword[$j]) - ord($lowestChar);
if ($GF[$currentState][$c] == -1)
{
$GF[$currentState][$c] = $states++;
}
$currentState = $GF[$currentState][$c];
}
$Out[$currentState] |= (1 << $i);
}
for ($c = 0; $c < $MaxChars; $c++)
{
if ($GF[0][$c] == -1)
{
$GF[0][$c] = 0;
}
}
$q = array();
for ($c = 0; $c <= ord($highestChar) - ord($lowestChar); $c++)
{
if ($GF[0][$c] != -1 && $GF[0][$c] != 0)
{
$FF[$GF[0][$c]] = 0;
$q[] = $GF[0][$c];
}
}
while (count($q))
{
$state = $q[0];
unset($q[0]);
$q = array_values($q);
for ($c = 0; $c <= ord($highestChar) - ord($lowestChar); $c++)
{
if ($GF[$state][$c] != -1)
{
$failure = $FF[$state];
while ($GF[$failure][$c] == -1)
{
$failure = $FF[$failure];
}
$failure = $GF[$failure][$c];
$FF[$GF[$state][$c]] = $failure;
$Out[$GF[$state][$c]] |= $Out[$failure];
$q[] = $GF[$state][$c];
}
}
}
return $states;
}
function FindNextState($currentState, $nextInput, $lowestChar = 'a')
{
global $GF, $FF;
$answer = $currentState;
$c = ord($nextInput) - ord($lowestChar);
while ($GF[$answer][$c] == -1)
{
$answer = $FF[$answer];
}
return $GF[$answer][$c];
}
function FindAllStates($text, $keywords, $lowestChar = 'a', $highestChar = 'z')
{
global $Out;
BuildMatchingMachine($keywords, $lowestChar, $highestChar);
$currentState = 0;
$retVal = array();
for ($i = 0; $i < strlen($text); $i++)
{
$currentState = FindNextState($currentState, $text[$i], $lowestChar);
if ($Out[$currentState] == 0)
continue;
for ($j = 0; $j < count($keywords); $j++)
{
if ($Out[$currentState] & (1 << $j))
{
if (count($retVal) == 0) {
$retVal[0] = $i - strlen($keywords[$j]) + 1;
}
else {
array_unshift($retVal, $i - strlen($keywords[$j]) + 1);
}
}
}
}
return $retVal;
}
Example
$keywords = array( "he", "she", "hers", "his" );
$text = "ahishers";
$states = FindAllStates($text, $keywords);
Output
4
3
4
1