A Proposed Solution
Man, this is a curse of mine! I apparently cannot walk away from a problem without spending up-to-and-including an unreasonable amount of time on it.
I thought about this. I thought about HTML Tidy, and maybe it would work, but I had trouble wrapping my head around it.
So, I wrote my own solution.
I tested this on your input and on some other input that I threw together myself. It seems to work pretty well. Surely there are holes in it, but it might provide you with a starting point.
Anyway, my approach was this:
- Encapsulate the notion of a single word in an HTML document using a class that records the chain of elements enclosing that word in the document hierarchy, up to (but not including) a given "top" ancestor node. This I have implemented in the
HtmlWord
class below.
- Create a class that is capable of writing a single line composed of the HTML words described above, such that start-element and end-element tags are added in the appropriate places. This I have implemented in the
HtmlLine
class below.
- Write a few extension methods to make these classes immediately and intuitively accessible straight from an
HtmlAgilityPack.HtmlNode
object. These I have implemented in the HtmlHelper
class below.
Am I crazy for doing all this? Probably, yes. But, you know, if you can't figure out any other way, you can give this a try.
Here's how it works with your sample input:
// Parse the sample markup and locate the paragraph that should be broken up.
var html = new HtmlDocument();
html.LoadHtml("<p><strong>Lorem ipsum dolor sit amet, <em>consectetur adipiscing</em></strong> elit.</p>");
var paragraph = html.DocumentNode.SelectSingleNode("p");

// Three words per line; print each resulting line's markup.
foreach (var line in paragraph.SplitIntoLines(3)) {
    Console.WriteLine(line.ToString());
}
Output:
<p><strong>Lorem ipsum dolor </strong></p>
<p><strong>sit amet, <em>consectetur </em></strong></p>
<p><strong><em>adipiscing </em></strong>elit. </p>
And now for the code:
HtmlWord class
using System;
using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;
/// <summary>
/// One word of text together with the chain of HTML nodes that enclose it,
/// ordered from the outermost node down to the word's immediate container.
/// </summary>
public class HtmlWord {
    /// <summary>The text of the word.</summary>
    public string Text { get; private set; }

    /// <summary>
    /// Enclosing nodes, outermost first. The "top" node passed to the
    /// constructor is not part of this array.
    /// </summary>
    public HtmlNode[] NodeStack { get; private set; }

    // Debugging aid: the names of the enclosing nodes as a comma-separated list.
    public string NodeList {
        get {
            string[] names = NodeStack.Select(n => n.Name).ToArray();
            return string.Join(", ", names);
        }
    }

    internal HtmlWord(string text, HtmlNode node, HtmlNode top) {
        Text = text;
        NodeStack = GetNodeStack(node, top);
    }

    // Walks from node up through its parents, stopping when the parent chain
    // ends or when top is reached (top itself is excluded).
    private static HtmlNode[] GetNodeStack(HtmlNode node, HtmlNode top) {
        var ancestors = new Stack<HtmlNode>();
        for (var current = node;
             current != null && !current.Equals(top);
             current = current.ParentNode) {
            ancestors.Push(current);
        }
        // Stack<T>.ToArray yields the most recently pushed item first,
        // i.e. the outermost ancestor comes first.
        return ancestors.ToArray();
    }
}
HtmlLine class
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml;
using HtmlAgilityPack;
/// <summary>
/// Bit flags describing how the enclosing-node stack differs between two
/// consecutive words. The values can be combined.
/// </summary>
[Flags]
public enum NodeChange {
    /// <summary>No difference.</summary>
    None = 0x0,
    /// <summary>At least one node was dropped.</summary>
    Dropped = 0x1,
    /// <summary>At least one node was added.</summary>
    Added = 0x2
}
/// <summary>
/// A run of consecutive <see cref="HtmlWord"/>s that can be rendered as one
/// piece of markup, with start and end tags inserted wherever the enclosing
/// nodes change from one word to the next.
/// </summary>
public class HtmlLine {
    private readonly List<HtmlWord> _words;

    /// <summary>Read-only view of the words on this line.</summary>
    public IList<HtmlWord> Words {
        get { return _words.AsReadOnly(); }
    }

    /// <summary>Number of words on this line.</summary>
    public int WordCount {
        get { return _words.Count; }
    }

    /// <summary>Creates a line from a copy of the given word sequence.</summary>
    /// <param name="words">The words that make up the line, in order.</param>
    public HtmlLine(IEnumerable<HtmlWord> words) {
        _words = new List<HtmlWord>(words);
    }

    /// <summary>
    /// Compares the enclosing-node stacks of two consecutive words.
    /// </summary>
    /// <param name="x">The earlier word.</param>
    /// <param name="y">The later word.</param>
    /// <param name="droppedNodes">
    /// Nodes that enclose <paramref name="x"/> but not <paramref name="y"/>,
    /// innermost first (the order in which they must be closed).
    /// </param>
    /// <param name="addedNodes">
    /// Nodes that enclose <paramref name="y"/> but not <paramref name="x"/>,
    /// outermost first (the order in which they must be opened).
    /// </param>
    /// <returns>
    /// A combination of <see cref="NodeChange.Dropped"/> and
    /// <see cref="NodeChange.Added"/> reflecting which of the two output
    /// arrays are non-empty; <see cref="NodeChange.None"/> if both are empty.
    /// </returns>
    private static NodeChange CompareNodeStacks(HtmlWord x, HtmlWord y, out HtmlNode[] droppedNodes, out HtmlNode[] addedNodes) {
        var droppedList = new List<HtmlNode>();
        var addedList = new List<HtmlNode>();

        // Walk x's stack from the innermost node outwards: every node that
        // does not also enclose y is "finished" and has to be closed.
        foreach (var node in x.NodeStack.Reverse()) {
            if (!Array.Exists(y.NodeStack, n => n.Equals(node))) {
                droppedList.Add(node);
            }
        }

        // Walk y's stack from the outermost node inwards: every node that
        // does not also enclose x is "new" and has to be opened.
        foreach (var node in y.NodeStack) {
            if (!Array.Exists(x.NodeStack, n => n.Equals(node))) {
                addedList.Add(node);
            }
        }

        droppedNodes = droppedList.ToArray();
        addedNodes = addedList.ToArray();

        // Flags are accumulated with |=. Using &= against the initial None (0)
        // would always leave the result at None.
        NodeChange change = NodeChange.None;
        if (droppedNodes.Length > 0) {
            change |= NodeChange.Dropped;
        }
        if (addedNodes.Length > 0) {
            change |= NodeChange.Added;
        }
        return change;
    }

    // Opens one element per node, in the given order, copying each node's
    // attributes onto the element that is written.
    private static void WriteStartElements(XmlWriter writer, IEnumerable<HtmlNode> nodes) {
        foreach (var node in nodes) {
            writer.WriteStartElement(node.Name);
            foreach (var attr in node.Attributes) {
                writer.WriteAttributeString(attr.Name, attr.Value);
            }
        }
    }

    /// <summary>
    /// Renders the line as markup. Each word is followed by a single space,
    /// and every element opened on the line is closed again on the same line.
    /// </summary>
    /// <returns>The markup for this line, or an empty string if the line has no words.</returns>
    public override string ToString() {
        if (WordCount < 1) {
            return string.Empty;
        }

        var lineBuilder = new StringBuilder();
        using (var lineWriter = new StringWriter(lineBuilder))
        using (var xmlWriter = new XmlTextWriter(lineWriter)) {
            // Open everything that encloses the first word.
            var firstWord = _words[0];
            WriteStartElements(xmlWriter, firstWord.NodeStack);
            xmlWriter.WriteString(firstWord.Text + " ");

            for (int i = 1; i < WordCount; ++i) {
                var previousWord = _words[i - 1];
                var word = _words[i];

                HtmlNode[] droppedNodes;
                HtmlNode[] addedNodes;
                CompareNodeStacks(previousWord, word, out droppedNodes, out addedNodes);

                // Close the elements the previous word was in but this one is not...
                for (int d = 0; d < droppedNodes.Length; ++d) {
                    xmlWriter.WriteEndElement();
                }
                // ...then open the ones that are new for this word.
                WriteStartElements(xmlWriter, addedNodes);

                xmlWriter.WriteString(word.Text + " ");
            }

            // Close whatever is still open after the last word. Doing this
            // here (instead of only inside the loop) also covers single-word
            // lines explicitly rather than relying on the writer closing
            // open elements when it is disposed.
            var lastWord = _words[WordCount - 1];
            for (int c = 0; c < lastWord.NodeStack.Length; ++c) {
                xmlWriter.WriteEndElement();
            }
        }
        return lineBuilder.ToString();
    }
}
HtmlHelper static class
using System;
using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;
/// <summary>
/// Extension methods that break the text under an <see cref="HtmlNode"/> into
/// words and group those words into lines.
/// </summary>
public static class HtmlHelper {
    // ASCII whitespace characters treated as word separators in HTML text.
    // Listed explicitly so that a non-breaking space is not treated as a
    // separator.
    private static readonly char[] WordSeparators = { ' ', '\t', '\r', '\n', '\f' };

    /// <summary>
    /// Splits the text under <paramref name="node"/> into lines of at most
    /// <paramref name="wordsPerLine"/> words each. The node's parent is used
    /// as the "top", so each line's markup includes <paramref name="node"/>
    /// itself.
    /// </summary>
    /// <param name="node">The node whose text is to be split.</param>
    /// <param name="wordsPerLine">Maximum number of words per line; must be at least 1.</param>
    /// <returns>A read-only list of lines, in document order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="node"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="wordsPerLine"/> is less than 1.</exception>
    public static IList<HtmlLine> SplitIntoLines(this HtmlNode node, int wordsPerLine) {
        if (node == null) {
            throw new ArgumentNullException("node");
        }
        // A step of zero or less would keep the loop below from ever
        // advancing past the end of the word list.
        if (wordsPerLine < 1) {
            throw new ArgumentOutOfRangeException("wordsPerLine", "wordsPerLine must be at least 1.");
        }

        var lines = new List<HtmlLine>();
        var words = node.GetWords(node.ParentNode);
        for (int i = 0; i < words.Count; i += wordsPerLine) {
            lines.Add(new HtmlLine(words.Skip(i).Take(wordsPerLine)));
        }
        return lines.AsReadOnly();
    }

    /// <summary>
    /// Collects every word found in the text nodes under
    /// <paramref name="node"/>, in document order.
    /// </summary>
    /// <param name="node">The node to search (recursively).</param>
    /// <param name="top">
    /// The ancestor at which each word's node stack stops; it is not included
    /// in the stack.
    /// </param>
    /// <returns>A read-only list of words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="node"/> is null.</exception>
    public static IList<HtmlWord> GetWords(this HtmlNode node, HtmlNode top) {
        if (node == null) {
            throw new ArgumentNullException("node");
        }

        var words = new List<HtmlWord>();
        if (node.HasChildNodes) {
            foreach (var child in node.ChildNodes) {
                words.AddRange(child.GetWords(top));
            }
        } else {
            var textNode = node as HtmlTextNode;
            if (textNode != null && !string.IsNullOrEmpty(textNode.Text)) {
                // Split on any run of whitespace, not just single spaces, so
                // words separated by line breaks or tabs are not glued together.
                string[] singleWords = textNode.Text.Split(
                    WordSeparators,
                    StringSplitOptions.RemoveEmptyEntries
                );
                // The word's stack starts at the element containing the text node.
                words.AddRange(
                    singleWords.Select(w => new HtmlWord(w, node.ParentNode, top))
                );
            }
        }
        return words.AsReadOnly();
    }
}
Conclusion
Just to reiterate: this is a thrown-together solution; I'm sure it has problems. I present it only as a starting point for you to consider -- again, if you're unable to get the behavior you want through other means.