views:

85

answers:

1
uri = 'http://www.nytimes.com/';
searchuri = 'http://www.google.com/search?';
searchuri += 'q='+ encodeURIComponent(uri) +'&btnG=Search+Directory&hl=en&cat=gwd%2FTop';
req = new XMLHttpRequest();
req.open('GET', searchuri, true);
req.onreadystatechange = function (aEvt) {
    if (req.readyState == 4) {
        if(req.status == 200) {
            searchcontents = req.responseText;
            myHTML = searchcontents;
            var tempDiv = document.createElement('div');
            tempDiv.innerHTML = myHTML.replace(/<script(.|\s)*?\/script>/g, '');
            parsedHTML = tempDiv;
            sitefound = sc_sitefound(uri, parsedHTML);
        }
    }
};
req.send(null);

function sc_sitefound(uri, parsedHTML) {
    alert(parsedHTML);
    gclasses = parsedHTML.getElementsByClassName('g');
    for (var gclass in gclasses) {
        atags = gclass.getElementsByTagName('a');
        alert(atags);
        tag1 = atags[0];
        htmlattribute1 =  tag1.getAttribute('html');
        if (htmlattribute1 == uri) {
            sitefound = htmlattribute1;
            return sitefound;
        }

    }
    return null;
}

parsedHTML is a XULElement
gclasses is an HTMLCollection

if there are many divs of class G in the Google Directory search results, why are the g classes empty?

+3  A: 
var tempDiv = document.createElement('div');

If you're in an XUL environment, that's not creating an HTML element node: it'll be an XUL element. Since the innerHTML property is exclusive to HTMLElement and not other XML Element​s, setting innerHTML on tempDiv will do nothing (other than adding a custom property containing the HTML string). Consequently there are no elements with class ‘g’ inside tempDiv... there are no elements at all inside it.

If you have a plain HTML document loaded in the browser, you could try using content.document.createElement to get an HTML wrapper element on which innerHTML will be available. This still isn't a brilliant way to parse a whole page of HTML because the document in question might have <head> content you can't put in a div, and HTTP headers that you'll be throwing away. Probably better to load the target file into an HTMLDocument object of its own. A good way to do that would be using an iframe. See this page for examples of both these approaches.

tempDiv.innerHTML = myHTML.replace(/<script(.|\s)*?\/script>/g, '');

It's seven shades of not-a-good-idea to process HTML with regex; this could go wrong in many ways when Google slightly change their page markup. Let the browser do the job of parsing instead. Setting innerHTML does not cause script elements to be executed straight away (futher DOM manipulations can though); you can pick out the unwanted script elements later, if you need to. With the XUL iframe approach you can simply disable JavaScript on the iframe.

for (var gclass in gclasses) {

The for...in loop is for use against Objects used as mappings. It should not be used for iterating a sequence (such as Array, NodeList or in this case HTMLCollection) as it doesn't do what you might expect. For iterating sequences, stick to the standard C-style for (var i= 0; i<sequence.length; i++) loop.

You could also do with adding var declarations for all your other local variables.

bobince