I have a set of strings and I need to find all of the occurrences in an HTML document. Where the string occurs is important because I need to handle each case differently:
String is all or part of an attribute. e.g., the string is foo:
<input value="foo">
-> Add class ATTR to the element. String is the full text of an element. e.g.,
<button>foo</button>
-> Add class TEXT to the element. String is inline in the text of an element. e.g.,
<p>I love foo</p>
-> Wrap the text in a span tag with class TEXT.
Also, I need to match the longest string first. e.g., if I have foo and foobar, then <p>I love foobar</p>
should become <p>I love <span class="TEXT">foobar</span></p>
, not <p>I love <span class="TEXT">foo</span>bar</p>
.
The inline text is easy enough: Sort the strings descending by length and find and replace each in document.body.innerHTML
with <span class="TEXT">$1</span>
, although I'm not sure if that is the most efficient way to go.
For the attributes, I can do something like this:
// For every search string, scan the serialized body for attributes whose
// value contains it and tag the matching elements with class ATTR.
sortedStrings.each(function(it) {
  // The backslash is doubled because this is a string literal: '\S' would
  // collapse to a plain 'S' before RegExp ever sees it.
  var attrPattern = new RegExp('(\\S+?)="[^"]*'+escapeRegExChars(it)+'[^"]*"','g');
  // replace() is used only as a "for each match" loop; its result is discarded.
  document.body.innerHTML.replace(attrPattern,function(s,attr) {
    // Build an attribute-contains selector such as [value*="foo"].
    $('['+attr+'*="'+it+'"]').addClass('ATTR');
  });
});
Again, that seems inefficient.
Lastly, for the full text elements, a depth first search of the document that compares the innerHTML
to each string will work, but for a large number of strings, it seems very inefficient.
Any answer that offers performance improvements gets an upvote :)
EDIT: I went with a modification on Bob's answer. delim
is an optional delimiter around the string (to differentiate it from normal text), and keys
is the list of strings.
/**
 * Depth-first, post-order walk over an element subtree.
 *
 * Every element child of the start node is visited (recursively) before the
 * start node itself, so `iterator` sees the deepest elements first.
 *
 * @param {Function} iterator Called once per element with `this` bound to
 *   that element. Its return value is handed back to jQuery's `.each()` for
 *   child nodes.
 * @param {Element} [scope] Node to start from; falls back to document.body
 *   when falsy.
 * @returns {*} Whatever `iterator` returns for the start node.
 */
function dfs(iterator,scope) {
  var root = scope || document.body;
  var visitChild = function() {
    return dfs(iterator,this);
  };
  $(root).children().each(visitChild);
  return iterator.call(root);
}
// Characters that are special in a regular expression (plus ' and /, which
// are harmless to escape). ^ and $ are included so anchors inside a key are
// matched literally.
var escapeChars = /['\/.*+?^$|()[\]{}\\]/g;

/**
 * Escape a string so it can be embedded in a RegExp and match literally.
 *
 * @param {string} text Raw text, e.g. a search key.
 * @returns {string} The text with every special character backslash-escaped.
 */
function safe(text) {
  // '$&' inserts the matched character itself. The pattern has no capture
  // group, so '$1' would be emitted as the literal text "$1".
  return text.replace(escapeChars, '\\$&');
}
/**
 * Invoke `iterator` once per search key, in the current order of `keys`.
 *
 * Reads the outer `keys` (list of strings) and `delim` (optional delimiter
 * surrounding each key in the document) variables.
 *
 * @param {Function} iterator Called as iterator(key, lit, exp) where
 *   key is the trimmed key, lit is the key wrapped in the delimiter, and
 *   exp is a global RegExp matching lit with the key as capture group 1.
 */
function eachKey(iterator) {
  var key, lit, i, len, exp;
  // The delimiter is optional: treat a missing one as empty instead of
  // letting it be concatenated as the text "undefined".
  var rawDelim = (delim == null) ? '' : String(delim);
  // Escape the delimiter too, so delimiters such as "*" or "|" neither throw
  // nor change the meaning of the pattern.
  var delimPattern = safe(rawDelim);
  for(i = 0, len = keys.length; i < len; i++) {
    key = keys[i].trim();
    // A blank key would produce a pattern that matches at every position.
    if(key === '') {
      continue;
    }
    lit = (rawDelim + key + rawDelim);
    exp = new RegExp(delimPattern + '(' + safe(key) + ')' + delimPattern,'g');
    iterator(key,lit,exp);
  }
}
// On DOM ready: walk the document bottom-up and, for every key, tag elements
// whose attributes or content contain the (delimited) key.
$(function() {
  // Longest keys first, so "foobar" is handled before its prefix "foo".
  keys = keys.sort(function(a,b) {
    return b.length - a.length;
  });
  dfs(function() {
    var el = $(this);
    eachKey(function(key,lit,exp) {
      var i, attr, val, html;
      var attrs = el[0].attributes;
      var names = [];
      // Snapshot the attribute names with an index loop. for...in over the
      // attributes collection is not a reliable way to visit its entries,
      // and the collection can change below when addClass() creates a
      // class attribute.
      for(i = 0; i < attrs.length; i++) {
        names.push(attrs[i].nodeName);
      }
      // check attributes
      for(i = 0; i < names.length; i++) {
        attr = names[i];
        val = el.attr(attr);
        if(typeof val !== 'string') {
          continue;
        }
        // Every successful test() on this global regex is followed by a
        // replace() with it, which leaves lastIndex back at 0.
        if(exp.test(val)) {
          // Write the cleaned value first: if attr is "class", writing the
          // stale value after addClass() would erase the class just added.
          el.attr(attr,val.replace(exp,"$1"));
          el.addClass(attrClass);
        }
      }
      // check all content
      html = el.html().trim();
      if(html === lit) {
        el.addClass(theClass);
        el.html(key); // remove delims
      } else if(exp.test(html)) {
        // check partial content
        // NOTE(review): this rewrites the element's serialized HTML, so it
        // presumably also touches matches inside descendants' markup and
        // recreates child nodes — confirm this is acceptable for the page.
        el.html(html.replace(exp,wrapper));
      }
    });
  });
});
Under the assumption that the traversal is the most expensive operation, this seems optimal, although improvements are still welcome.