Aho–Corasick Algorithm
Aho–Corasick algorithm is a string searching algorithm. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the "dictionary") within an input text. It matches all strings simultaneously.
Private Const MaxStates As Integer = 6 * 50 + 10
Private Const MaxChars As Integer = 26
Private Shared Out As Integer() = New Integer(MaxStates - 1) {}
Private Shared FF As Integer() = New Integer(MaxStates - 1) {}
Private Shared GF As Integer(,) = New Integer(MaxStates - 1, MaxChars - 1) {}
Private Shared Function BuildMatchingMachine(words As String(), Optional lowestChar As Char = "a"c, Optional highestChar As Char = "z"c) As Integer
Out = Enumerable.Repeat(0, Out.Length).ToArray()
FF = Enumerable.Repeat(-1, FF.Length).ToArray()
For i As Integer = 0 To MaxStates - 1
For j As Integer = 0 To MaxChars - 1
GF(i, j) = -1
Next
Next
Dim states As Integer = 1
For i As Integer = 0 To words.Length - 1
Dim keyword As String = words(i)
Dim currentState As Integer = 0
For j As Integer = 0 To keyword.Length - 1
Dim c As Integer = Convert.ToInt32(keyword(j)) - Convert.ToInt32(lowestChar)
If GF(currentState, c) = -1 Then
GF(currentState, c) = System.Math.Max(System.Threading.Interlocked.Increment(states), states - 1)
End If
currentState = GF(currentState, c)
Next
Out(currentState) = Out(currentState) Or (1 << i)
Next
For c As Integer = 0 To MaxChars - 1
If GF(0, c) = -1 Then
GF(0, c) = 0
End If
Next
Dim q As New List(Of Integer)()
For c As Integer = 0 To Convert.ToInt32(highestChar) - Convert.ToInt32(lowestChar)
If GF(0, c) <> -1 AndAlso GF(0, c) <> 0 Then
FF(GF(0, c)) = 0
q.Add(GF(0, c))
End If
Next
While Convert.ToBoolean(q.Count)
Dim state As Integer = q(0)
q.RemoveAt(0)
For c As Integer = 0 To Convert.ToInt32(highestChar) - Convert.ToInt32(lowestChar)
If GF(state, c) <> -1 Then
Dim failure As Integer = FF(state)
While GF(failure, c) = -1
failure = FF(failure)
End While
failure = GF(failure, c)
FF(GF(state, c)) = failure
Out(GF(state, c)) = Out(GF(state, c)) Or Out(failure)
q.Add(GF(state, c))
End If
Next
End While
Return states
End Function
Private Shared Function FindNextState(currentState As Integer, nextInput As Char, Optional lowestChar As Char = "a"c) As Integer
Dim answer As Integer = currentState
Dim c As Integer = Convert.ToInt32(nextInput) - Convert.ToInt32(lowestChar)
While GF(answer, c) = -1
answer = FF(answer)
End While
Return GF(answer, c)
End Function
Public Shared Function FindAllStates(text As String, keywords As String(), Optional lowestChar As Char = "a"c, Optional highestChar As Char = "z"c) As List(Of Integer)
BuildMatchingMachine(keywords, lowestChar, highestChar)
Dim currentState As Integer = 0
Dim retVal As New List(Of Integer)()
For i As Integer = 0 To text.Length - 1
currentState = FindNextState(currentState, text(i), lowestChar)
If Out(currentState) = 0 Then
Continue For
End If
For j As Integer = 0 To keywords.Length - 1
If Convert.ToBoolean(Out(currentState) And (1 << j)) Then
retVal.Insert(0, i - keywords(j).Length + 1)
End If
Next
Next
Return retVal
End Function
Example
Dim keywords As String() = {"he", "she", "hers", "his"}
Dim text As String = "ahishers"
Dim states As List(Of Integer) = FindAllStates(text, keywords)
Output
4
3
4
1