Thanks in advance for your consideration
I need to convert inline css style attributes to their HTML tag equivelants. The solution I have works but runs VERY slowly using the Microsoft .Net Regex namespace and long documents (~40 pages of html). I've tried several variations but with no useful results. I've done a little wrapping around executing the expressions but in the end it's just the built-in regex Replace method that gets called.
I'm sure I'm abusing the greediness of the regex but I'm not sure of a way around it to achieve what I want using a single regex.
I want to be able to run the following unit tests:
[Test]
public void TestCleanReplacesFontWeightWithB()
{
string html = "<font style=\"font-weight:bold\">Bold Text</font>";
html = Q4.PrWorkflow.Helper.CleanFormatting(html);
Assert.AreEqual("<b>Bold Text</b>", html);
}
[Test]
public void TestCleanReplacesMultipleAttributesFontWeightWithB()
{
string html = "<font style=\"font-weight:bold; color: blue; \">Bold Text</font>";
html = Q4.PrWorkflow.Helper.CleanFormatting(html);
Assert.AreEqual("<b>Bold Text</b>", html);
}
[Test]
public void TestCleanReplaceAttributesBoldAndUnderlineWithHtml()
{
string html = "<span style=\"font-weight:bold; color: blue; text-decoration: underline; \">Bold Text</span>";
html = Q4.PrWorkflow.Helper.CleanFormatting(html);
Assert.AreEqual("<u><b>Bold Text</b></u>", html);
}
[Test]
public void TestCleanReplaceAttributesBoldUnderlineAndItalicWithHtml()
{
string html = "<span style=\"font-weight:bold; color: blue; font-style: italic; text-decoration: underline; \">Bold Text</span>";
html = Q4.PrWorkflow.Helper.CleanFormatting(html);
Assert.AreEqual("<u><b><i>Bold Text</i></b></u>", html);
}
[Test]
public void TestCleanReplacesFontWeightWithSpaceWithB()
{
string html = "<font size=\"10\" style=\"font-weight: bold\">Bold Text</font>";
html = Q4.PrWorkflow.Helper.CleanFormatting(html);
Assert.AreEqual("<b>Bold Text</b>", html);
}
The regular expresion I am using to achieve this logic works but is VERY slow. The regex in the c# code looks like this:
public static IReplacePattern IncludeInlineItalicToITag(ICleanUpHtmlFactory factory)
{
return factory.CreateReplacePattern("(<(span|font) .*?style=\".*?font-style:\\s*italic[^>]*>)(.*?)</\\2>", "$1<i>$3</i></$2>");
}
public static IReplacePattern IncludeInlineBoldToBTag(ICleanUpHtmlFactory factory)
{
return factory.CreateReplacePattern("(<(span|font) .*?style=\".*?font-weight:\\s*bold[^>]*>)(.*?)</\\2>", "$1<b>$3</b></$2>");
}
public static IReplacePattern IncludeInlineUnderlineToUTag(ICleanUpHtmlFactory factory)
{
return factory.CreateReplacePattern("(<(span|font) .*?style=\".*?text-decoration:\\s*underline[^>]*>)(.*?)</\\2>", "$1<u>$3</u></$2>");
}