Here is a sample HTML document that I pass as a MemoryStream to the code given below:
<h5>Sample Document </h5>
<h3> Present Tense </h3>
</p><p>The present tense is just as you have learned. You take the dictionary form of a verb, drop the 다, add the appropriate ending.
</p><p>먹다 - 먹 + 어요 = 먹어요 <br />
마시다 - 마시 + 어요 - 마시어요 - 마셔요. <br />
</p><p>This tense is used to represent what happens in the present. I eat. I drink. It is a general term for the present.
The program below contains three functions:
Main
ReadDocument
TestByteOffset
The Main function takes the HTML document specified above, converts it to a MemoryStream, and then passes it to the ReadDocument function, which stores the result in a variable called docContent. It is a class-level variable.
Then the Main function takes the selected text (myRange.Text) and tries to find its index in the given document. Once the index is found, it is stored in the intByteOffset variable.
The third function, TestByteOffset, then checks whether the index stored in intByteOffset is correct.
This is where I am having issues: when I try to get the string back from the byte offset, I do not receive the selected text.
Can someone please help me out with this?
The Source Code
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace MultiByteStringHandling
{
class Program
{
/// <summary>
/// Reads the file named by FileName, passes its bytes to ReadDocument, and then computes
/// the UTF-8 byte offset of the OccurenceNo-th occurrence of the selected range's HTML
/// (myRange.htmlText) inside the body's HTML (CompleteRange.htmlText).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Read the whole file up front; the using blocks release the file handle
    // even when reading throws.
    byte[] fileBytes;
    using (FileStream fs = new FileStream(FileName, FileMode.Open, FileAccess.Read))
    using (BinaryReader br = new BinaryReader(fs))
    {
        fileBytes = br.ReadBytes((int)fs.Length);
    }

    using (MemoryStream documentStream = new MemoryStream(fileBytes))
    {
        ReadDocument(documentStream);
    }

    mshtml.IHTMLTxtRange CompleteRange = _body.createTextRange().duplicate();
    int intByteOffset = 0;

    // The selection is literal text, not a pattern: escape it so characters such as
    // '+', '.', '(' or '?' in the selected HTML are matched verbatim instead of being
    // interpreted as regex operators.
    string strTemp = Regex.Escape(myRange.htmlText ?? string.Empty) + "\\s*";

    // Find the selection where it sits between the end of one tag and the start of the next.
    string strExp = ">(([^<])*?)" + strTemp + "(([^<])*?)<";
    string _cleanedSource = CompleteRange.htmlText;

    // The loops are run to completion on purpose: intCount is declared outside this
    // method, so its final value stays the same as before this rewrite.
    foreach (Match m in Regex.Matches(_cleanedSource, strExp, RegexOptions.IgnoreCase))
    {
        foreach (Match m2 in Regex.Matches(m.Value, strTemp, RegexOptions.IgnoreCase))
        {
            // Count every occurrence of the selection.
            intCount += 1;

            if (intCount == OccurenceNo)
            {
                // Character offset = index of the enclosing match + index of the
                // selection inside that match.
                int intCharOffset = m.Index + m2.Index;

                // Convert the character offset into a UTF-8 byte offset by measuring the
                // encoded size of everything that precedes the selection.
                // NOTE(review): this offset is measured in CompleteRange.htmlText, not in
                // the bytes handed to ReadDocument. If the two serializations differ, the
                // offset will not line up with docContent - confirm.
                intByteOffset = Encoding.UTF8.GetByteCount(
                    _cleanedSource.Substring(0, intCharOffset));
            }
        }
    }
}
/// <summary>
/// Copies the entire content of <paramref name="sD"/> into docContent, followed by a
/// single trailing zero byte.
/// </summary>
/// <param name="sD">
/// Stream holding the document. It is rewound to position 0 before reading and is
/// closed when this method returns, because the BinaryReader wrapping it is disposed.
/// </param>
private void ReadDocument(Stream sD)
{
    sD.Position = 0;

    using (System.IO.MemoryStream ms = new System.IO.MemoryStream())
    {
        using (System.IO.BinaryReader br = new System.IO.BinaryReader(sD))
        {
            // ReadBytes returns an empty array once the end of the stream is reached.
            byte[] buffer = br.ReadBytes(8192);
            while (buffer.Length > 0)
            {
                ms.Write(buffer, 0, buffer.Length);
                buffer = br.ReadBytes(8192);
            }
        }

        // ToArray returns exactly the bytes that were written. GetBuffer would return the
        // whole internal buffer, including unused capacity, which padded docContent with
        // spurious zero bytes.
        byte[] docBytes = ms.ToArray();

        // Keep the single trailing zero byte that the previous implementation appended.
        docContent = new byte[docBytes.Length + 1];
        Array.Copy(docBytes, docContent, docBytes.Length);
    }
}
/// <summary>
/// Selects an offset/length pair from <paramref name="transparency"/>: from Label.IEOffset
/// when Label is set, otherwise from Value.ByteOffset when Value is set, ByteOffset is not
/// null and its Offset is not -1. Returns false when both Label and Value are null.
/// </summary>
/// <param name="transparency">Item whose Label or Value supplies the offset and length.</param>
/// <returns>false when neither Label nor Value is present.</returns>
// NOTE(review): as listed, the method stops right after choosing offset/length. The locals
// `encoding` and `byteOffsetLabel` are never used and no value is returned on the
// Label/Value paths, so the comparison step appears to be missing from this listing.
private bool TestByteOffset(TransparencyItemType transparency)
{
System.Text.UTF8Encoding encoding = default(System.Text.UTF8Encoding);
string byteOffsetLabel = null;
Int32 iLength = default(Int32);
Int32 offset = default(Int32);
// Label takes precedence over Value.
if (((transparency.Label == null) == false))
{
iLength = Convert.ToInt32(transparency.Label.IEOffset.Length);
offset = Convert.ToInt32(transparency.Label.IEOffset.Offset);
}
else if (((transparency.Value == null) == false))
{
if(transparency.Value.ByteOffset!=null)
{
// An Offset of -1 is skipped, leaving iLength and offset at 0.
// NOTE(review): -1 is presumably a "not set" marker - confirm.
if (transparency.Value.ByteOffset.Offset != -1)
{
iLength = Convert.ToInt32(transparency.Value.ByteOffset.Length);
offset = Convert.ToInt32(transparency.Value.ByteOffset.Offset);
}
}
}
else
{
// Neither Label nor Value is available, so there is nothing to check.
return false;
}
}