Hi! I recently found out about n-grams and the cool possibility to compare frequency of phrases in a text body with it. Now I'm trying to make an vb.net app that simply gets an text body and returns a list of the most frequently used phrases (where n >= 2).
I found an C# example of how to generate a n-gram from a text body so I started out with converting the code to VB. The problem is that this code does create one gram per character instead of one per word. The delimiters I want to use for the words is: VbCrLf (new line), vbTab (tabs) and the following characters: !@#$%^&*()_+-={}|\:\"'?¿/.,<>’¡º×÷‘;«»[]
Does anyone have an idea how I can rewrite the following function for this purpose:
Friend Shared Function GenerateNGrams(ByVal text As String, ByVal gramLength As Integer) As String()
If text Is Nothing OrElse text.Length = 0 Then
Return Nothing
End If
Dim grams As New ArrayList()
Dim length As Integer = text.Length
If length < gramLength Then
Dim gram As String
For i As Integer = 1 To length
gram = text.Substring(0, (i) - (0))
If grams.IndexOf(gram) = -1 Then
grams.Add(gram)
End If
Next
gram = text.Substring(length - 1, (length) - (length - 1))
If grams.IndexOf(gram) = -1 Then
grams.Add(gram)
End If
Else
For i As Integer = 1 To gramLength - 1
Dim gram As String = text.Substring(0, (i) - (0))
If grams.IndexOf(gram) = -1 Then
grams.Add(gram)
End If
Next
For i As Integer = 0 To (length - gramLength)
Dim gram As String = text.Substring(i, (i + gramLength) - (i))
If grams.IndexOf(gram) = -1 Then
grams.Add(gram)
End If
Next
For i As Integer = (length - gramLength) + 1 To length - 1
Dim gram As String = text.Substring(i, (length) - (i))
If grams.IndexOf(gram) = -1 Then
grams.Add(gram)
End If
Next
End If
Return Tokeniser.ArrayListToArray(grams)
End Function