Just don't parse HTML using regex. Read this: http://www.codinghorror.com/blog/2009/11/parsing-html-the-cthulhu-way.html
In JavaScript, you can turn an HTML string into a DOM tree by assigning it to an element's .innerHTML
property, and after that you can use other DOM methods to traverse it.
Simple example (needs a browser console that provides console.log, e.g. Firebug):
// Sample markup to be parsed by the browser.
var html = '<p>foo <span>bar</span><br /></p>';

// Detached container element; assigning to innerHTML makes the browser
// build the DOM nodes for the markup as children of this element.
var div = document.createElement('div');
div.innerHTML = html;
/**
 * Lower-case tag names of the HTML void elements, i.e. the elements that
 * never have a closing tag. Only these are reported as self-closing.
 */
var VOID_TAGS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr'
];

/**
 * Recursively walks a DOM subtree and logs it to the console in document order.
 *
 * For an element node (nodeType 1) the opening tag and the current depth are
 * logged, then every child is scanned at depth + 1, then the closing tag and
 * depth are logged. A void element without children is logged once as
 * `<name />` and gets no closing tag. Any other node has its `data` property
 * logged (without a depth) and its children, if any, are scanned as well.
 *
 * @param {Node} node - Node to start from.
 * @param {number} [depth] - Nesting level of `node`; defaults to 0.
 */
function scan(node, depth)
{
    depth = depth || 0;
    var is_tag = node.nodeType === 1;
    var self_contained = false;
    var tag_name;
    if (is_tag) {
        tag_name = node.tagName.toLowerCase();
        // An empty non-void element such as <span></span> still has a closing
        // tag, so having no children is not enough to call it self-closing.
        self_contained = node.childNodes.length === 0 &&
            VOID_TAGS.indexOf(tag_name) !== -1;
        console.log('<' + tag_name + (self_contained ? ' /' : '') + '>', depth);
    } else {
        console.log(node.data);
    }
    for (var i = 0, n = node.childNodes.length; i < n; i++) {
        scan(node.childNodes[i], depth + 1);
    }
    if (!self_contained && is_tag) {
        console.log('</' + tag_name + '>', depth);
    }
}
// Walk the container element; depth is omitted, so scan() starts at 0.
scan(div);
Output:
<div> 0
<p> 1
foo
<span> 2
bar
</span> 2
<br /> 2
</p> 1
</div> 0
You could also modify this to output attributes and use the depth argument for indentation.