The code below uses the HTML::TreeBuilder module, which is an appropriate tool for parsing HTML. Regular expressions are not.
#! /usr/bin/perl
use warnings;
use strict;
use HTML::TreeBuilder;
The test cases from your question:
my @cases = (
'<font color="#008080"><span style="background: #ffffff"></span></font>',
'<font color="#008080"> s</font>',
'<font></font>',
);
We'll use is_empty
as a predicate to the look_down
method of HTML::Element to find <font>
elements with no interesting content.
sub is_empty {
my($font) = @_;
my $is_interesting = sub {
for ($_[0]->content_list) {
return 1 if !ref($_) && /\S/;
}
};
!$font->look_down($is_interesting);
}
Finally the main loop. For each fragment, we create a new HTML::TreeBuilder
instance, remove empty <font>
elements, and trim the first-level text content of those that remain.
foreach my $html (@cases) {
my $tree = HTML::TreeBuilder->new_from_content($html);
$_->detach for $tree->guts->look_down(_tag => "font", \&is_empty);
my $result = "";
if ($tree->guts) {
foreach my $font ($tree->guts->look_down(_tag => "font")) {
$font->attr($_,undef) for $font->all_external_attr_names;
foreach my $text ($font->content_refs_list) {
next if ref $$text;
$$text =~ s/^\s+//;
$$text =~ s/\s+$//;
}
}
($result = $tree->guts->as_HTML) =~ s/\s+$//;
}
print "$result\n";
}
Output:
<font>s</font>
Making two passes is sloppy. The code could be improved:
#! /usr/bin/perl
use warnings;
use strict;
use HTML::TreeBuilder;
my @cases = (
'<font color="#008080"><span style="background: #ffffff"></span></font>',
'<font color="#008080"> s</font>',
'<font></font>',
);
foreach my $fragment (@cases) {
my $tree = HTML::TreeBuilder->new_from_content($fragment);
foreach my $font ($tree->guts->look_down(_tag => "font")) {
$font->detach, next
unless $font->look_down(sub { grep !ref && /\S/ => $_[0]->content_list });
$font->attr($_,undef) for $font->all_external_attr_names;
foreach my $text ($font->content_refs_list) {
next if ref $$text;
$$text =~ s/^\s+//;
$$text =~ s/\s+$//;
}
}
(my $cleaned = $tree->guts ? $tree->guts->as_HTML : "") =~ s/\s+$//;
print $cleaned, "\n";
}