_documentContent contains the whole document as HTML view source.
patternToFind contains the text to be searched for in _documentContent.
The code snippet below works fine if the language is English. However, the same code doesn't work at all when it encounters a language like Korean.
Sample Document
Present Tense
The present tense is just as you have learned. You take the dictionary form of a verb, drop the 다, add the appropriate ending.
먹다 - 먹 + 어요 = 먹어요
마시다 - 마시 + 어요 - 마시어요 - 마셔요.
This tense is used to represent what happens in the present. I eat. I drink. It is a general term for the present.
When I try to find 먹, the code below fails.
Can someone please suggest a solution to this?
using System;
using System.Collections.Generic;
using System.Text;
namespace MultiByteStringHandling
{
/// <summary>
/// Demonstrates searching UTF-8 encoded text for a byte pattern using the
/// Knuth-Morris-Pratt (KMP) algorithm.
/// </summary>
class Program
{
    /// <summary>
    /// Encodes a sample Korean sentence and a search term as UTF-8 and prints
    /// the byte offset of the first occurrence of the term.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string _documentContent = @"먹다 - 먹 + 어요 = 먹어요";
        byte[] patternToFind = Encoding.UTF8.GetBytes("먹");
        byte[] DocumentBytes = Encoding.UTF8.GetBytes(_documentContent);

        // indexOf is an instance method, so static Main needs an object to call it on.
        int intByteOffset = new Program().indexOf(DocumentBytes, patternToFind);

        // The result is an offset into the UTF-8 byte array, not a char index into
        // the string: the two differ as soon as a multi-byte character (each Hangul
        // syllable is 3 bytes in UTF-8) precedes the match.
        Console.WriteLine(intByteOffset.ToString());
    }

    /// <summary>
    /// Finds the first occurrence of <paramref name="pattern"/> inside
    /// <paramref name="data"/> using the KMP algorithm.
    /// </summary>
    /// <param name="data">The bytes to search in.</param>
    /// <param name="pattern">The bytes to search for.</param>
    /// <returns>
    /// The zero-based byte offset of the first match; 0 when
    /// <paramref name="pattern"/> is empty; -1 when there is no match.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="data"/> or <paramref name="pattern"/> is null.
    /// </exception>
    public int indexOf(byte[] data, byte[] pattern)
    {
        if (data == null)
        {
            throw new ArgumentNullException("data");
        }
        if (pattern == null)
        {
            throw new ArgumentNullException("pattern");
        }

        // An empty pattern trivially matches at the start; returning here also
        // keeps the loop below from indexing pattern[0] on an empty array.
        if (pattern.Length == 0)
        {
            return 0;
        }

        int[] failure = computeFailure(pattern);

        // j is the number of pattern bytes currently matched.
        int j = 0;
        for (int i = 0; i < data.Length; i++)
        {
            // On a mismatch, fall back to the longest proper prefix of the
            // pattern that is also a suffix of what has been matched so far.
            while (j > 0 && pattern[j] != data[i])
            {
                j = failure[j - 1];
            }
            if (pattern[j] == data[i])
            {
                j++;
            }
            if (j == pattern.Length)
            {
                // i is the last byte of the match; convert to its start offset.
                return i - pattern.Length + 1;
            }
        }

        // Also reached when data is empty: a non-empty pattern cannot occur in it.
        return -1;
    }

    /// <summary>
    /// Computes the KMP failure function by matching the pattern against itself.
    /// </summary>
    /// <param name="pattern">The pattern bytes.</param>
    /// <returns>
    /// An array in which entry i is the length of the longest proper prefix of
    /// pattern[0..i] that is also a suffix of it.
    /// </returns>
    private static int[] computeFailure(byte[] pattern)
    {
        int[] failure = new int[pattern.Length];
        int j = 0;
        for (int i = 1; i < pattern.Length; i++)
        {
            while (j > 0 && pattern[j] != pattern[i])
            {
                j = failure[j - 1];
            }
            if (pattern[j] == pattern[i])
            {
                j++;
            }
            failure[i] = j;
        }
        return failure;
    }
}
}