views:

145

answers:

2

Hello, I have this function in Delphi 2009/2010:

It returns garbage. If I change the Char/PChar types to AnsiChar/PAnsiChar it returns the text, but all foreign Unicode text is garbage. It is driving me bananas; I have been trying all kinds of things for two days now. I thought I understood this Unicode crap, but I guess I do not. Help, please. Thanks, Philippe Watel

{ Downloads the resource at fileURL through WinINet and returns everything that
  was read, concatenated into one string.

  fileURL - address handed to InternetOpenURL.
  Result  - the accumulated chunks; '' if nothing was read.

  NOTE(review): the received bytes are reinterpreted as Char without any
  charset decoding. On a Unicode Delphi (2009+), where Char is a 2-byte
  WideChar, that only yields readable text if the payload itself is UTF-16. }
function GetInetFileAsString(const fileURL: string): string;
const
  C_BufferSize = 1024;
var
  sAppName: string;
  hSession,
    hURL: HInternet;

  // NOTE(review): bounds 0..C_BufferSize give C_BufferSize + 1 elements of Char,
  // so SizeOf(Buffer) is (C_BufferSize + 1) * SizeOf(Char) bytes.
  Buffer: array[0..C_BufferSize] of Char;
  BufferLen: DWORD; // number of bytes reported by the last InternetReadFile call

  strPageContent: string;
  strTemp: string;

begin
  Result := '';
  // The executable's file name is passed to InternetOpen as the agent string.
  sAppName := ExtractFileName(Application.ExeName);
  hSession := InternetOpen(PChar(sAppName), INTERNET_OPEN_TYPE_PRECONFIG, nil,
    nil, 0);
  // NOTE(review): hSession is not checked for nil before it is used.
  try
    hURL := InternetOpenURL(hSession, PChar(fileURL), nil, 0, 0, 0);
    // NOTE(review): hURL is not checked for nil before it is used.
    try
      strPageContent := '';
      repeat
        // NOTE(review): the Boolean result of InternetReadFile is ignored;
        // confirm what BufferLen holds after a failed call.
        InternetReadFile(hURL, @Buffer, SizeOf(Buffer), BufferLen);
        // BufferLen is a byte count; dividing by SizeOf(Char) turns it into a
        // character count and silently drops a trailing odd byte.
        SetString(strTemp, PChar(@buffer), BufferLen div SizeOf(Char));
        strPageContent := strPageContent + strTemp;
      until BufferLen = 0; // a zero-byte read ends the loop
      Result := strPageContent;
    finally
      InternetCloseHandle(hURL)
    end
  finally
    InternetCloseHandle(hSession)
  end
end;
A: 

My first thought is to add an appropriate Accept-Charset header to the request:

e.g:

Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7

Lloyd
+1  A: 

Starting in D2009, String is an alias for UnicodeString. An HTML page, on the other hand, is typically encoded using a multi-byte Ansi encoding instead (usually UTF-8 nowadays, but not always). Your current code will only work if the HTML is encoded as UTF-16, which is very rare. You should not be reading the raw HTML bytes into a UnicodeString directly. You need to first download the entire data into a TBytes, RawByteString, TMemoryStream, or other suitable byte container of your choosing, and then perform an Ansi->Unicode conversion afterwards, based on the charset that is specified in the HTTP "Content-Type" response header.

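Try something like this (for brevity it asks the server for UTF-8 and decodes the downloaded bytes as UTF-8, rather than inspecting the "Content-Type" response header):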

{ Downloads the resource at fileURL through WinINet and returns its body
  decoded as UTF-8.

  fileURL - address handed to InternetOpenURL.
  Result  - the decoded text; '' if the session or URL cannot be opened, or if
            any InternetReadFile call fails (partial data is discarded).

  The raw bytes are collected first and converted to a string only once the
  download is complete, so a multi-byte sequence that straddles two reads is
  never decoded in halves.

  NOTE(review): the decoding is hard-coded to UTF-8. The request asks for it via
  Accept-Charset, but a server may ignore that header; honouring the charset in
  the Content-Type response header would be more robust. }
function GetInetFileAsString(const fileURL: string): string;
const
  C_BufferSize = 1024;
var
  sAppName: string;
  hSession, hURL: HInternet;
  // Exactly C_BufferSize bytes: indices 0..C_BufferSize - 1.
  Buffer: array[0..C_BufferSize - 1] of Byte;
  BufferLen: DWORD;
  strHeader: String;
  strPageContent: TStringStream;
begin
  Result := '';
  // The executable's file name is passed to InternetOpen as the agent string.
  sAppName := ExtractFileName(Application.ExeName);
  hSession := InternetOpen(PChar(sAppName), INTERNET_OPEN_TYPE_PRECONFIG, nil, nil, 0);
  // Nothing to close and nothing to read if the session could not be created.
  if hSession = nil then
    Exit;
  try
    strHeader := 'Accept-Charset: utf-8'#13#10;
    hURL := InternetOpenURL(hSession, PChar(fileURL), PChar(strHeader), Length(strHeader), 0, 0);
    // Exit still runs the enclosing finally, so the session handle is released.
    if hURL = nil then
      Exit;
    try
      strPageContent := TStringStream.Create('', TEncoding.UTF8);
      try
        repeat
          // A failed read aborts the download; Result stays ''.
          if not InternetReadFile(hURL, @Buffer[0], SizeOf(Buffer), BufferLen) then
            Exit;
          // A successful read of zero bytes marks the end of the data.
          if BufferLen = 0 then
            Break;
          strPageContent.WriteBuffer(Buffer[0], BufferLen);
        until False;
        // Convert the accumulated bytes to a string using the stream's encoding.
        Result := strPageContent.DataString;
      finally
        strPageContent.Free;
      end;
    finally
      InternetCloseHandle(hURL);
    end;
  finally
    InternetCloseHandle(hSession);
  end;
end;
Remy Lebeau - TeamB