views:

719

answers:

1

Hi all, I'm trying to get the charset attribute of an HTML meta tag (e.g. <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">). Is there any way to do that in C++ under Linux? I was using HTML Tidy as a parser, but I can't get that attribute to return anything other than us-ascii (even if the encoding is UTF-8).
This is the output I got: .4 Node: meta
Name attr: http-equiv
Value attr: Content-Type
Name attr: content
Value attr: text/html; charset=us-ascii

Thanks in advance,
Alejo

+1  A: 

As requested by Vinko Vrsalovic, here is the code that produces that result:
void dumpNode( TidyNode tnod, int indent )
{
   TidyNode child;

   for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) )
   {
      ctmbstr name;
      switch ( tidyNodeGetType(child) )
     {
       case TidyNode_Root: name = "Root"; break;
       case TidyNode_DocType: name = "DOCTYPE"; break;
       case TidyNode_Comment: name = "Comment"; break;
       case TidyNode_ProcIns: name = "Processing Instruction"; break;
       case TidyNode_Text: name = "Text"; break;
       case TidyNode_CDATA: name = "CDATA"; break;
       case TidyNode_Section: name = "XML Section"; break;
       case TidyNode_Asp: name = "ASP"; break;
       case TidyNode_Jste: name = "JSTE"; break;
       case TidyNode_Php: name = "PHP"; break;
       case TidyNode_XmlDecl: name = "XML Declaration"; break;

       case TidyNode_Start:
       case TidyNode_End:
       case TidyNode_StartEnd:
       default:
       name = tidyNodeGetName( child );
       TidyAttr att = tidyAttrFirst(child);
       while (att)
       {
         std::cout < <"Name attr: " << tidyAttrName(att) << std::endl;
        std::cout< <"Value attr:"<< tidyAttrValue(att) << std::endl;
         att =         tidyAttrNext(att);
       }
      break;
      }
      assert( name != NULL );
      printf( "%d*.*%d%sNode: %s\n", indent, indent, " ", name );
      dumpNode( child, indent + 4 );
     }
}
/**
 * Dumps the node tree of a parsed document.
 *
 * Fetches the node returned by tidyGetHtml for `tdoc` and prints everything
 * beneath it via dumpNode, starting with an indent of 0.
 *
 * @param tdoc Document whose tree is printed.
 */
void dumpHtml( TidyDoc tdoc)
{
      TidyNode htmlNode = tidyGetHtml(tdoc);
      const int startIndent = 0;
      dumpNode( htmlNode, startIndent );
}

int main(int argc, char **argv) {
      std::string toReturn("");
      TidyBuffer output;
      TidyBuffer errbuf;
      int rc = -1;
      Bool ok;

      tidyBufInit(&output);
      tidyBufInit(&errbuf);

      TidyDoc tdoc = tidyCreate();
      ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes ); // Convert to XHTML
      if ( ok )
           rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics
      if ( rc >= 0 )
           rc = tidyParseFile(tdoc, "fuebuena.html"); // Parse the input
      if ( rc >= 0 )
           rc = tidyCleanAndRepair( tdoc ); // Tidy it up!
      if (rc >= 0)
           dumpHtml(tdoc);

      return 0;
}

Alejo