tags:

views:

29

answers:

0

When using libxml2 to parse non-UTF8 encoded HTML, htmlDocContentDumpOutput returns empty data.

URL used to test : http://www.asahi.com/national/update/0628/TKY201006280289.html and askmen.com

I don't know encoding used before hand.

NSData * data = [[NSData alloc] initWithContentsOfURL:_url options:0 error:&error];
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(NSUTF8StringEncoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
const char *enc = CFStringGetCStringPtr(cfencstr, 0);
_doc = htmlReadDoc((xmlChar*)[data bytes],
      "",
      enc,
      XML_PARSE_NOERROR | XML_PARSE_NOWARNING | XML_PARSE_NOBLANKS);


xmlBufferPtr buffer = xmlBufferCreateSize(1000);
xmlOutputBufferPtr buf = xmlOutputBufferCreateBuffer(buffer, NULL);

htmlDocContentDumpOutput(buf, _doc, NULL);
xmlOutputBufferFlush(buf);

NSString *usSting;  
if (buffer->content) {
    usSting = [[[NSString alloc] initWithBytes:(const void *)xmlBufferContent(buffer) length:xmlBufferLength(buffer) encoding:NSUTF8StringEncoding] autorelease];
}
NSLog(@"%@, %d", usSting, [data length]);