The code below uses a couple of regular expressions to do the parsing. Beware, though, that real-world HTML can easily break it, for example when random spaces, tabs, etc. appear inside tags and code. The code includes an array of test cases through which problem input can be run.
The idea is to first clean up the HTML, then repeatedly remove elements that have a matching closing tag, and finally return the last tag that remains open.
<html>
<head><title>Last Open HTML Tag</title></head>
<body>
<h1>Last Open HTML Tag</h1>
<?php
/**
 * Demo script: report the last HTML tag that is still open in a fragment.
 *
 * Each test fragment is run through findLastOpenTagMatch() and the fragment,
 * the regex matches and the resulting tag name are printed. Everything that
 * originates from a fragment is HTML-escaped before being echoed so that the
 * browser displays the markup instead of interpreting it.
 */

/**
 * Locates the last tag left open in an HTML fragment using regular expressions.
 *
 * Two situations are recognised, in this order:
 *  1. The fragment ends inside a tag that was never terminated with '>'
 *     (for example "<img src='' alt=").
 *  2. Otherwise, elements of the form "<tag>text</tag>" (no attributes, no
 *     nested markup) are stripped repeatedly until nothing changes, and the
 *     last remaining "<tag>" that is followed only by text is reported.
 *
 * Limitations: step 2 never strips an element whose opening tag carries
 * attributes, and a tag that has no closing tag at all (such as "<br>") can
 * be reported as the open tag.
 *
 * @param string $html Raw HTML fragment to inspect.
 *
 * @return array Regex match as [0 => matched text, 1 => tag name], or an
 *               empty array when no open tag was found.
 */
function findLastOpenTagMatch($html)
{
    // Normalise whitespace directly after '<' so "< img" is read as "<img".
    $html = preg_replace('/<\s*(\w)/', '<$1', $html);
    if (!is_string($html)) {
        return [];
    }

    // Same for closing tags ("< / b>" becomes "</b>"). '#' is the delimiter
    // because the pattern itself contains a literal '/'.
    $html = preg_replace('#<\s*/\s*(\w)#', '</$1', $html);
    if (!is_string($html)) {
        return [];
    }

    // Case 1: the fragment ends in the middle of a tag (no closing '>').
    if (preg_match('/<(\w*)\W[^><]*$/', $html, $found) > 0) {
        return $found;
    }

    // Case 2: peel off innermost closed elements until the text is stable.
    do {
        $before = $html;
        $html = preg_replace('/<(\w*)>[^<>]*<\/\1>/s', '', $before);
        if (!is_string($html)) {
            return [];
        }
    } while ($html !== $before);

    // Whatever "<tag>" is now followed only by plain text is the open one.
    if (preg_match('/<(\w*)>[^<>]*$/', $html, $found) > 0) {
        return $found;
    }

    return [];
}

// Test fragments; each one is cut off somewhere after its last open tag.
$htmlstrings = [
    "<html>\n<body>\n<h1>\n<b>aaa</b> bbbb\n",
    "<html>\n<body>\n<h3>test</h3>\n<h1>\n<b>aaa <i>test2</i></b> <i>test</i> bbbb\n",
    "<body>\n<img src='' alt=\n",
    "<body>\n< img src='' alt=\n",
];

$num = 1;
foreach ($htmlstrings as $rawstring) {
    $real_matches = findLastOpenTagMatch($rawstring);

    echo "<p>Parse $num\n";

    // Escape the fragment so it is shown as text rather than rendered.
    echo "<br>" . htmlspecialchars($rawstring, ENT_QUOTES, 'UTF-8') . "\n";

    foreach ($real_matches as $match) {
        echo "<br>" . htmlspecialchars($match, ENT_QUOTES, 'UTF-8') . "\n";
    }

    // Index 1 only exists when one of the patterns matched.
    $lastTag = isset($real_matches[1]) ? $real_matches[1] : '(none)';
    echo "<br>LAST OPEN TAG: " . htmlspecialchars($lastTag, ENT_QUOTES, 'UTF-8') . "\n";

    $num++;
}
?>
</body>
</html>