I am using HTML Parser to develop an application. The code below is not able to get the entire set of tags in the page. There are some tags which are missed out and the attributes and text body of them are also missed out. Please help me to explain why is this happening.....or suggest me other way....
URL url = new URL("...");
PrintWriter pw=new PrintWriter(new FileWriter("HTMLElements.txt"));
URLConnection connection = url.openConnection();
InputStream is = connection.getInputStream();
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument)htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
ElementIterator iterator = new ElementIterator(htmlDoc);
Element element;
while ((element = iterator.next()) != null)
{
AttributeSet attributes = element.getAttributes();
Enumeration e=attributes.getAttributeNames();
pw.println("Element Name :"+element.getName());
while(e.hasMoreElements())
{
Object key=e.nextElement();
Object val=attributes.getAttribute(key);
int startOffset = element.getStartOffset();
int endOffset = element.getEndOffset();
int length = endOffset - startOffset;
String text=htmlDoc.getText(startOffset, length);
pw.println("Key :"+key.toString()+" Value :"+val.toString()+"\r\n"+"Text :"+text+"\r\n");
}
}
}