I use htmlparser 1.6 to parse web sites.
The problem is that when I parse pdf web sites, I obtain in the output file strange characters like
ØÇÁÖÜ/:?ÖQØ?WÕWÏ
This is a fragment of my code :
try {
parser = new Parser ();
if (1 < args.length)
filter = new TagNameFilter (args[1]);
else
{
filter = null;
parser.setFeedback (Parser.STDOUT);
Parser.getConnectionManager ().setMonitor (parser);
}
Parser.getConnectionManager ().setRedirectionProcessingEnabled (true);
Parser.getConnectionManager ().setCookieProcessingEnabled (true);
// Here the pdf web site
parser.setResource ("http://hal.archives-ouvertes.fr" +
"/docs/00/16/76/78/PDF /27_Bendaoud.pdf");
NodeList list = parser.parse(filter);
NodeIterator i = list.elements ();
while (i.hasMoreNodes ())
processMyNodes(i.nextNode ());
}
catch (EncodingChangeException ece) {
try {
parser.reset ();
NodeList list = parser.parse(filter);
for (NodeIterator i = list.elements (); i.hasMoreNodes (); )
processMyNodes (i.nextNode ());
}
catch (ParserException e) {
e.printStackTrace ();
}
}
catch (ParserException e) {
e.printStackTrace ();
}