When using libxml2 to parse non-UTF8 encoded HTML, htmlDocContentDumpOutput returns empty data.
URL used to test : http://www.asahi.com/national/update/0628/TKY201006280289.html and askmen.com
I don't know encoding used before hand.
NSData * data = [[NSData alloc] initWithContentsOfURL:_url options:0 error:&error];
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(NSUTF8StringEncoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
const char *enc = CFStringGetCStringPtr(cfencstr, 0);
_doc = htmlReadDoc((xmlChar*)[data bytes],
"",
enc,
XML_PARSE_NOERROR | XML_PARSE_NOWARNING | XML_PARSE_NOBLANKS);
xmlBufferPtr buffer = xmlBufferCreateSize(1000);
xmlOutputBufferPtr buf = xmlOutputBufferCreateBuffer(buffer, NULL);
htmlDocContentDumpOutput(buf, _doc, NULL);
xmlOutputBufferFlush(buf);
NSString *usSting;
if (buffer->content) {
usSting = [[[NSString alloc] initWithBytes:(const void *)xmlBufferContent(buffer) length:xmlBufferLength(buffer) encoding:NSUTF8StringEncoding] autorelease];
}
NSLog(@"%@, %d", usSting, [data length]);