In situations like this, you typically want to use whatever SAX interface the underlying library offers to traverse and rewrite the input XML (or XHTML) statefully and serially:
require 'cgi'
require 'nokogiri'
Nokogiri::XML::SAX::Parser.new(
Class.new(Nokogiri::XML::SAX::Document) {
# Remembers which paragraph ids delimit the run that gets wrapped in a <div>.
#
# first_p - id of the <p> in front of which the opening <div> is printed
# last_p  - id of the <p> after whose end tag the closing </div> is printed
#
# @depth is intentionally left unset here: the element callbacks treat a nil
# @depth as "not currently inside the last paragraph".
def initialize(first_p, last_p)
  @first_p = first_p
  @last_p = last_p
end
# Document-start callback: writes a fixed HTML 4.0 Transitional DOCTYPE
# declaration, followed by a newline, to standard output.
def start_document
  doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
  puts(doctype)
end
# Opening-tag callback: re-emits the tag with its attributes and tracks the
# paragraph run that is being wrapped in a <div>.
#
# name  - element name
# attrs - the element's attributes. Nokogiri documents this as an assoc list
#         of [name, value] pairs; a flat [name, value, ...] list (the shape
#         the previous `Hash[*attrs]` relied on) is accepted as well.
#
# Side effects:
# * prints '<div>' immediately before a <p> whose id equals @first_p
# * starts the nesting counter @depth at 1 on a <p> whose id equals @last_p,
#   and increments it for every element opened while it is running
# * prints the start tag, with attribute values HTML-escaped
def start_element(name, attrs = [])
  # Flattening first makes both attribute shapes collapse into the same
  # alternating name/value sequence before pairing them up again.
  attributes = attrs.flatten.each_slice(2).to_h

  @depth += 1 if @depth

  is_paragraph = name == 'p'
  print '<div>' if is_paragraph && attributes['id'] == @first_p
  # Only start counting when no count is running, so a nested match does not
  # reset the depth of the outer one.
  @depth = 1 if is_paragraph && attributes['id'] == @last_p && @depth.nil?

  rendered = attributes.map { |key, value| %(#{key}="#{CGI.escapeHTML(value)}") }
  print "<#{[name, *rendered].join(' ')}>"
end
# Closing-tag callback: re-emits the end tag and, once the element that
# started the @depth counter has been closed, prints the closing '</div>'.
#
# name - element name
#
# @depth is nil while no counter is running; otherwise it is the number of
# elements still open since the counter was started.
def end_element(name)
  @depth -= 1 if @depth
  print "</#{name}>"

  # A nil @depth never equals 0, so this also covers "no counter running".
  return unless @depth == 0

  print '</div>'
  @depth = nil
end
# CDATA callback: re-emits the text as a CDATA section.
#
# string - the text content of the CDATA section
#
# The text is written verbatim: inside a CDATA section markup characters are
# literal, so HTML-escaping them (as this method used to do) would change the
# content, e.g. turn "a < b" into the literal text "a &lt; b".
def cdata_block(string)
  # "]]>" is the one sequence that cannot appear inside a CDATA section.
  # Should it ever show up in the text, close the section after "]]" and
  # reopen it before ">" so the output stays well-formed.
  safe = string.gsub(']]>', ']]]]><![CDATA[>')
  print "<![CDATA[#{safe}]]>"
end
# Character-data callback: prints the text with HTML special characters
# escaped via CGI.escapeHTML.
#
# string - the run of text to emit
def characters(string)
  escaped = CGI.escapeHTML(string)
  print(escaped)
end
# Comment callback: prints the comment text unchanged between the
# '<!--' and '-->' delimiters.
#
# string - the comment text, without its delimiters
def comment(string)
  print format('<!--%s-->', string)
end
}.new('2', '4')
).parse(<<-HTML_END)
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<body>
<!-- comment -->
<![CDATA[
cdata goes here
]]>
"special" entities
<p id="1">A</p>
<p id="2">B</p>
<p id="3">C</p>
<p id="4">D</p>
<p id="5">E</p>
<emptytag/>
</body>
</html>
HTML_END
Alternatively, you can use the DOM interface (instead of the SAX interface) to load the entire document into memory (as you started doing in your original question) and then manipulate the nodes (inserting and removing them) as follows:
require 'rubygems'
require 'nokogiri'
# Parse a small sample document into a DOM tree.
doc = Nokogiri::HTML.parse(<<-HTML_END)
<html>
<body>
<p id='1'>A</p>
<p id='2'>B</p>
<p id='3'>C</p>
<p id='4'>D</p>
<p id='5'>E</p>
</body>
</html>
HTML_END

# ids of the first and last paragraph of the run that gets wrapped in a <div>
first_p = "2"
last_p = "4"

# Select every "last" paragraph that follows the "first" paragraph as a sibling.
closing_paragraphs = doc.css("p[id=\"#{first_p}\"] ~ p[id=\"#{last_p}\"]")

closing_paragraphs.each do |closing|
  # wrapper is the <div> currently being filled; nil while outside the run.
  wrapper = nil

  closing.parent.children.each do |child|
    is_paragraph = child.name == 'p'

    # Reaching the first paragraph: create the <div> and place it just before.
    if is_paragraph && child['id'] == first_p
      wrapper = Nokogiri::XML::Node.new('div', doc)
      child.add_previous_sibling(wrapper)
    end

    # While a wrapper is active, move each sibling out of the parent and into it.
    if wrapper
      child.remove
      wrapper << child
    end

    # The last paragraph has just been moved, so the run ends here.
    wrapper = nil if is_paragraph && child['id'] == last_p
  end
end

puts doc
Cheers,
V.