views:

164

answers:

1

Hi guys,

I'm a mostly-newbie when it comes to web development (though not to programming in general) so pardon any incorrect terminology.

I want to build a script that, when added to an HTML page, detects each Hebrew word in the page and transforms that word into an HTML element, e.g. into a hyperlink with title.

So, the following:

<p>ראש הלשכה</p>

Is transformed into:

<p><a title="word 1" href="#">הלשכה</a> <a title="word 2" href="#">ראש</a></p>

Make sense?

So, I suppose the first order of business is detecting Hebrew words in a page. How would I go about doing this? I don't know where to start, outside of poking around jQuery documentation.

+3  A: 

Searching for a Hebrew word in a string is fairly simple. Use a regexp that matches a contiguous sequence of Hebrew code points:

/[\u05D0-\u05FF]+/

Since JS supports functional programming, we can easily write our own functions to walk the document tree, calling a function on each text node. First, a bit of scaffolding.

if (! window.assert) {
    // Debug level consulted by assert(): any truthy value enables the checks.
    window.dbgLvl = 1; // change this to 0 for production release
    /* assert: throws when 'succeeded' is falsy and debugging is enabled.

       Arguments:
         succeeded (any): the condition being asserted.
         msg (string, optional): failure description; defaults to
             'assertion failed'.

       Throws an Error (rather than a bare string) so that the failure
       carries a stack trace and a 'message' property.
    */
    window.assert=function(succeeded, msg) {
        if (dbgLvl && !succeeded) {
            throw new Error(msg || 'assertion failed');
        }
    };
}

Next, we define a method to split strings into an array, including separators in the output.

/* String.separate is like String.split, but the result includes the
   separators: pieces of text sit at even indices and the separators that
   were matched sit at odd indices.

   Arguments:
     separator (RegExp || string): what to split on. A string is compiled
         into a global RegExp. A RegExp should be global and, for the
         first implementation below, wrap the whole pattern in a single
         capturing group, e.g. /(...)/g.

   Returns an array of strings. If nothing matches, the array holds just
   the original string.

   These implementations of 'String.separate' will work for our purposes,
   but are buggy in general, due to differences in the implementation of
   String.split.

   The two misbehaviors we correct are including neither grouped patterns
   nor empty strings in the result, though the latter is only corrected
   when the missing empty string is at the start or the end.
*/
if ('-'.split(/(-)/).length & 1) {
    assert('a'.split(/a/).length, 'split includes grouping but not empty strings');
    // split includes groups in result, so a grouped pattern is all we need
    String.prototype.separate = function (separator) {
        if (typeof separator == 'string') {
            // make sure the pattern is grouped so split keeps the separators
            if (separator.charAt(0) != '('
                || separator.charAt(separator.length-1) != ')')
            {
                separator = new RegExp('(' + separator + ')', 'g');
            } else {
                separator = new RegExp(separator, 'g');
            }
        }
        return this.split(separator);
    };
} else {
    if ('a'.split(/a/).length) {
        // empty strings included, grouped aren't
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                separator = new RegExp(separator, 'g');
            }
            // 'fence' holds the separators, 'posts' the text between them
            var fence = this.match(separator);
            if (!fence) {
                // String(this) so callers get a primitive, as split returns
                return [String(this)];
            }
            var posts = this.split(separator);
            // compare, don't assign: writing to posts.length would resize
            // the array and make the assertion pass unconditionally
            assert(posts.length == fence.length+1);
            var result = [], i;
            for (i=0; i<fence.length; ++i) {
                result.push(posts[i]);
                result.push(fence[i]);
            }
            result.push(posts[i]);
            return result;
        };
    } else {
        // neither empty strings nor groups are included. IE, you suck.
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                separator = new RegExp(separator, 'g');
            }
            var fence = this.match(separator);
            if (!fence) {
                // String(this) so callers get a primitive, as split returns
                return [String(this)];
            }
            var posts = this.split(separator);
            if (posts.length <= fence.length) {
                /* missing some posts. Assume that they are the first or
                   last, though this won't be true in general.
                */
                if (posts.length < fence.length) {
                    // two short: both the leading and trailing post are gone
                    posts.unshift('');
                    posts.push('');
                } else {
                    // one short: it's the leading post if the string starts
                    // with the first separator, otherwise the trailing one
                    if (this.substring(0, fence[0].length) == fence[0]) {
                        posts.unshift('');
                    } else {
                        posts.push('');
                    }
                }
            }
            var result = [], i;
            for (i=0; i<fence.length; ++i) {
                result.push(posts[i]);
                result.push(fence[i]);
            }
            result.push(posts[i]);
            return result;
        };
    }
}

Next, some node predicates.

// Guarantee that Node.TEXT_NODE is available: add the constant when a Node
// object exists without it, or create a minimal Node object otherwise.
if (window.Node) {
    if (typeof Node.TEXT_NODE == 'undefined') {
        Node.TEXT_NODE = 3;
    }
} else {
    window.Node = {TEXT_NODE: 3};
}

/* isTextNode: true when the node's nodeType equals Node.TEXT_NODE. */
function isTextNode(node) {
    var kind = node.nodeType;
    return kind == Node.TEXT_NODE;
}
/* hasKids: truthy when the node has a non-empty childNodes collection.
   Returns the falsy childNodes value itself when there is none, otherwise
   the number of children. */
function hasKids(node) {
    var kids = node.childNodes;
    if (!kids) {
        return kids;
    }
    return kids.length;
}
/* allNodes: predicate that accepts every node; the argument is ignored. */
function allNodes(node) {
    return true;
}

Now the functions to walk the DOM.

/*
  forEachChild: pre-order traversal of document tree. Applies a function to some nodes, determined by the 'which' and 'descendInto' arguments.

Arguments:
  which  (function): Returns true if 'action' should be applied to a node.
  action (function): Takes a node and does something to it. It may replace
      or remove the node it is given; the traversal continues with the
      sibling that followed the node before 'action' ran.
  parent (Node): The node to start from. The node itself is not visited,
      only its descendants.
  descendInto (function, optional): By default, forEachChild will descend into every child that itself has children. Place additional restrictions by passing this argument.
*/
var forEachChild = (function() {
        /* the actual implementation is made a local function so that the
           optional parameter can be handled efficiently.
         */
        function _forEachChild(which, action, node, descendInto) {
            var child = node.firstChild;
            var next;
            while (child) {
                /* Grab the sibling before running 'action'. If 'action'
                   detaches 'child' (e.g. via replaceChild), its
                   nextSibling becomes null and the rest of the siblings
                   would otherwise be skipped.
                 */
                next = child.nextSibling;
                if (which(child)) {
                    action(child);
                }
                if (hasKids(child) && descendInto(child)) {
                    _forEachChild(which, action, child, descendInto);
                }
                child = next;
            }
        }
        return function (which, action, node, descendInto) {
            if (!descendInto) {descendInto=allNodes}
            _forEachChild(which, action, node, descendInto);
        };
    })();

/* forEachNode: runs forEachChild over the whole document, starting at the
   'document' node. Arguments are as for forEachChild, minus the start node. */
function forEachNode(which, action, descendInto) {
    var root = document;
    var outcome = forEachChild(which, action, root, descendInto);
    return outcome;
}

/* forEachTextNode: applies 'action' to the nodes in the document that
   isTextNode accepts, by delegating to forEachNode. */
function forEachTextNode(action, descendInto) {
    var onlyText = isTextNode;
    var outcome = forEachNode(onlyText, action, descendInto);
    return outcome;
}

/* forEachTextNodeInBody: applies 'action' to the nodes below document.body
   that isTextNode accepts, by delegating to forEachChild. */
function forEachTextNodeInBody(action, descendInto) {
    var root = document.body;
    var outcome = forEachChild(isTextNode, action, root, descendInto);
    return outcome;
}

The last group of functions replaces text in a text node that matches a pattern with a new node of your choosing. This group (well, the function returned by wrapText) hasn't been completely tested for cross-browser compatibility, including whether it handles text direction properly.

/*
   wrapText replaces substrings in a text node with new nodes.

 Arguments:
   pattern (RegExp || string): If a RegExp, must be of the form: '/(...)/g'.
   replace (function): Takes a string and returns a Node to replace the string.

Returns a function that takes a text node. That function splits the node's
text with String.separate and, if the pattern matched, swaps the node for a
document fragment holding the unmatched text (as text nodes) and the nodes
produced by 'replace'. Empty stretches of text produce no node. A node
whose text doesn't match is left untouched.
*/
function wrapText(pattern, replace) {
    return function (node) {
        // chunks alternates: text at even indices, matches at odd indices
        var chunks = node.nodeValue.separate(pattern);
        if (chunks.length < 2)
            return;
        var fragment = document.createDocumentFragment();
        var i;
        // don't bother adding first chunk if it's empty.
        if (chunks[0].length) {
            fragment.appendChild(document.createTextNode(chunks[0]));
        }
        for (i=1; i < chunks.length; i+=2) {
            // the match itself...
            fragment.appendChild(replace(chunks[i]));
            // ...followed by the text after it, unless there is none
            // (end of the string, or two matches back to back)
            if (chunks[i+1]) {
                fragment.appendChild(document.createTextNode(chunks[i+1]));
            }
        }
        /* chunks.length is odd (text, match, text, ..., text) and i starts
         * at 1 and advances by 2, so i == chunks.length when the loop
         * finishes: every match and every trailing piece of text has been
         * handled.
         */
        assert(i == chunks.length, 'even number of chunks in ['+chunks+'] when it should be odd.');
        node.parentNode.replaceChild(fragment, node);
    };
}

/*
  createAnchorWrap builds a function that wraps a string in an anchor
  element and gives that anchor a title.

Arguments:
  title (string || function, optional): Source of the anchor's title.
      A function is called with the string being wrapped and its result
      becomes the title. A string is handed to createWordCounter as the
      prefix. When omitted, createWordCounter is called without a prefix.

Returns a function that takes a string and returns an anchor element whose
only child is a text node holding that string.
 */
function createAnchorWrap(title) {
    var makeTitle;
    if (typeof title == 'string') {
        makeTitle = createWordCounter(title);
    } else if (title) {
        makeTitle = title;
    } else {
        makeTitle = createWordCounter();
    }
    return function (word) {
        var anchor = document.createElement('a');
        anchor.title = makeTitle(word);
        var label = document.createTextNode(word);
        anchor.appendChild(label);
        return anchor;
    };
}

/*
  createWordCounter creates a word counter, which returns the number of
  times it's been called (including the current call), prefixed by a string.
  Each counter created keeps its own count, starting at 1 on the first call.

Arguments:
  pre (string, optional): prefix for return value. Trailing spaces are
      collapsed into a single space separating the prefix from the count.
      Defaults to 'word '.

Returns a function that takes a string (ignored) and returns a string.

 */
function createWordCounter(pre) {
    var wordCount=0;
    var prefix;
    if (pre) {
        // normalize to exactly one space between prefix and number
        prefix = pre.replace(/ *$/, ' ');
    } else {
        prefix = 'word ';
    }
    return function(text) {
        // count this call before reporting, so the first word is number 1
        wordCount += 1;
        return prefix + wordCount;
    };
}

The last thing you have to do is start the process in (e.g.) a load handler or a script at the bottom of the page.

// Kick off the transformation: for every text node under document.body,
// wrap each run of characters in the range U+05D0-U+05FF in an anchor
// element. With no argument, createAnchorWrap takes its titles from a
// word counter created with the default prefix.
forEachTextNodeInBody(wrapText(/([\u05D0-\u05FF]+)/g,
                               createAnchorWrap()));

If you want to change the prefix for the title, pass the result of createWordCounter(...) to createAnchorWrap.

outis
Ok, that's a start. So, Javascript has built-in support for RegEx. Alright, great. Now, that bit about finding the text in an HTML document...
Judah Himango
Ok, so you've written some Javascript functions now to walk the tree. Looks like I could use forEachTextNode(action) to somehow replace the text element with an anchor element. Alright. I'll see what I can do. Thanks for the help so far.
Judah Himango
Note that using a JS library (jQuery, Prototype, MooTools ...) might still be a good idea.
outis
Hmmm. I tried the following: "הלשכה".match(new RegExp("/[\u05D0-\u05FF]+/")) and that returned false, no match. Did I do something wrong?
Judah Himango
JS supports RE literals. Try `"ראש הלשכה".match(/[\u05D0-\u05FF]+/g)`. In cases where you need to use the RegExp constructor (basically, when you need to interpolate variables), don't add delimiters: `new RegExp('[' + start + '-' + end + ']')`.
outis