views:

30

answers:

1

Where keys are represented by element type and values are represented by #foo and .bar (spaced and ready for explode()). Is it possible, or does something exist for it?

I know that this question might incite some wrath, and I'm hoping nobody links to that post about parsing HTML, but I'm hoping it's not impossible. Thanks for the help.

Addendum: Ideally, PHP would be used, since it's the only scripting language I know.

A: 

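Thanks for all the help :\ This function converts a body of HTML into a multidimensional (nested) array. Each element is represented by its tag name followed by its class and/or id written as CSS-style selectors (e.g. `.bar`, `#foo`), and each element's children are nested inside its array.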

<?php

/**
 * Converts a body of HTML into a nested array of elements.
 *
 * Scripts, style blocks and comments are removed, whitespace is normalised
 * to single-character spaces, every remaining tag is extracted, tags that
 * are not whitelisted are neutralised by checkTags(), and the flat tag list
 * is finally folded into a tree by htmlToArray().
 *
 * @param string $raw_html The markup to convert.
 *
 * @return array The tree produced by htmlToArray() for the first tag, or an
 *               empty array when the input contains no tags at all.
 */
function htmlArrayer($raw_html){

    $match_open_or_closed = '/(\<(\/?[^\>]+)\>)/';
    $match_scripts = '@<script[^>]*?>.*?</script>@si';
    // No "U" modifier here: combined with ".*?" it would make the quantifier
    // greedy and swallow everything between the first <style> and the last
    // </style>, including real markup in between.
    $match_styles = '@<style[^>]*?>.*?</style>@si';
    $match_comments = '/<!--.*?-->/si';

    foreach (array($match_scripts, $match_styles, $match_comments) as $pattern) {
        $stripped = preg_replace($pattern, '', $raw_html);

        // preg_replace() yields null on a PCRE error (e.g. backtrack limit);
        // in that case keep the text untouched rather than losing the input.
        if ($stripped !== null) {
            $raw_html = $stripped;
        }
    }

    // Pad the tag delimiters, then turn every whitespace character into a
    // plain space so tag contents can later be split with explode(' ').
    $raw_html = str_replace('>', '> ', $raw_html);
    $raw_html = str_replace('<', ' <', $raw_html);
    $raw_html = str_replace('!--', '!-- ', $raw_html);
    $raw_html = preg_replace('/[ \t\r\n]/', ' ', $raw_html);

    // $matches[2] holds the text between "<" and ">" of every tag.
    preg_match_all($match_open_or_closed, $raw_html, $matches);

    // Nothing to build a tree from.
    if (empty($matches[2])) {
        return array();
    }

    $matches[2] = checkTags($matches[2]);
    $html_array = htmlToArray($matches[2], 0);

    return $html_array;

}

/**
 * Replaces every tag that is not whitelisted with the placeholder 'br'.
 *
 * Each entry is trimmed and split on spaces; its first token is taken as the
 * tag name. An entry is kept untouched when that name is a whitelisted tag
 * or a whitelisted tag prefixed with '/' (a closing tag). Any other entry is
 * overwritten with the string 'br'. Keys of the input array are preserved.
 *
 * @param array $htmlArray Tag contents, i.e. the text between "<" and ">".
 *
 * @return array The same list with unknown tags replaced by 'br'.
 */
function checkTags($htmlArray) {
    $whitelist = array('html', 'body', 'div', 'span', 'applet', 'object', 'iframe', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'pre', 'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'code', 'del', 'dfn', 'em', 'font', 'img', 'ins', 'kbd', 'q', 's', 'samp', 'small', 'strike', 'strong', 'sub', 'sup', 'tt', 'var', 'b', 'u', 'i', 'center', 'dl', 'dt', 'dd', 'ol', 'ul', 'li','fieldset', 'form', 'label', 'legend', 'table', 'caption', 'tbody', 'tfoot', 'thead', 'tr', 'th', 'td');

    // Lookup table holding both the opening and the closing form of each tag.
    $accepted = array();
    foreach ($whitelist as $name) {
        $accepted[$name] = true;
        $accepted['/' . $name] = true;
    }

    foreach ($htmlArray as $position => $tag_text) {
        $tokens = explode(' ', trim($tag_text));

        if (!isset($accepted[$tokens[0]])) {
            $htmlArray[$position] = 'br';
        }
    }

    return $htmlArray;
}

/**
 * Folds a flat list of tags into a nested array, starting at one tag.
 *
 * The returned array has the element's own label ("tagname " followed by
 * the selector string from attrToCSS()) at position 0; every further entry
 * is the array of one direct child, built the same way. Elements without a
 * closing tag (br, img, ...) become one-entry arrays.
 *
 * Scanning stops at the element's matching closing tag or, when the markup
 * is unbalanced and no such tag exists, at the end of the list.
 *
 * @param array $untiered_array Flat list of tag contents (the text between
 *                              "<" and ">"), closing tags starting with '/'.
 * @param int   $index          Position of the element to build.
 *
 * @return array The nested representation, or an empty array when $index
 *               does not point at an entry of the list.
 */
function htmlToArray($untiered_array, $index){
    if (!isset($untiered_array[$index])) {
        return array();
    }

    $total = count($untiered_array);

    // Elements that never get a closing tag; they cannot open a new tier.
    $one_way_elements = array('br', 'img', 'area', 'base', 'basefront', 'hr', 'input', 'link', 'meta', 'col', 'embed', 'param');

    $untiered_element = explode(' ', $untiered_array[$index]);

    // A leading 'br' (typically a placeholder for a non-whitelisted tag such
    // as a doctype) is skipped, provided there is a following tag to use.
    if ($untiered_element[0] == 'br' && $index + 1 < $total) {
        $index++;
        $untiered_element = explode(' ', $untiered_array[$index]);
    }

    $new_array_layer = array($untiered_element[0] . ' ' . attrToCSS($untiered_array[$index]));
    $closing_tag = '/' . $untiered_element[0];

    // Nesting depth relative to this element: 0 means "directly inside it".
    $tier_check = 0;

    // Walk forward until this element's own closing tag is met at depth 0.
    // The upper bound keeps unbalanced markup from running past the list.
    for ($i = $index + 1; $i < $total; $i++) {
        $current = $untiered_array[$i];

        if ($tier_check == 0 && $current === $closing_tag) {
            break;
        }

        $next_element_name = explode(' ', $current);
        $is_closing = strpos($current, '/') === 0;

        if (in_array($next_element_name[0], $one_way_elements, true)) {
            // Self-closing type: never changes the depth; recorded as a
            // one-entry array only when it is a direct child.
            if ($tier_check == 0) {
                $new_array_layer[] = array($next_element_name[0] . ' ' . attrToCSS($current));
            }
        } elseif (!$is_closing) {
            // An opening tag goes one tier deeper; direct children are
            // expanded recursively, deeper descendants are handled by that
            // recursive call and only counted here.
            $tier_check++;

            if ($tier_check == 1) {
                $new_array_layer[] = htmlToArray($untiered_array, $i);
            }
        }

        // A closing tag comes back up one tier.
        if ($is_closing) {
            $tier_check--;
        }
    }

    return $new_array_layer;
}

/**
 * Builds a space-separated selector string from a tag's id and class.
 *
 * The id (if any) comes first as "#id", followed by one ".name" entry per
 * class, e.g. 'div class="a b" id="x"' gives '#x .a .b'. The result can be
 * split again with explode(' ').
 *
 * Only real id/class attributes are considered (not data-id, subclass, ...),
 * attribute names are matched case-insensitively, and values may be wrapped
 * in double or single quotes.
 *
 * @param string $attr_string Tag contents, i.e. the text between "<" and ">".
 *
 * @return string The selector string, or '' when the tag has neither an id
 *                nor a class.
 */
function attrToCSS($attr_string){

    $attr_string = (string) $attr_string;
    $selectors = array();

    // The lookbehind rejects names that merely end in "id" (data-id, grid).
    if (preg_match('/(?<![\w-])id\s*=\s*(["\'])(.*?)\1/is', $attr_string, $id_match)) {
        $id_value_string = trim($id_match[2]);

        if ($id_value_string !== '') {
            $selectors[] = '#' . $id_value_string;
        }
    }

    if (preg_match('/(?<![\w-])class\s*=\s*(["\'])(.*?)\1/is', $attr_string, $class_match)) {
        // Split on any run of whitespace so doubled spaces yield no bare '.'.
        foreach (preg_split('/\s+/', $class_match[2], -1, PREG_SPLIT_NO_EMPTY) as $class) {
            $selectors[] = '.' . $class;
        }
    }

    return implode(' ', $selectors);
}


?>
dclowd9901