



Using the following code, I can download the HTML of a file from the internet:

// Client object used below to fetch a resource as text.
WebClient wc = new WebClient();

// ....

// Fetch the resource and keep its content as a string.
// NOTE(review): no assignment to wc.Encoding is visible in this snippet; the
// elided section above may contain one - confirm. The URL argument is an
// empty string here, presumably elided from the original post.
string downloadedFile = wc.DownloadString("");

However, sometimes the file contains "interesting" characters, and these come out garbled in the downloaded string: é turns into Ã©, ← turns into â† and フシギダネ turns into ãƒ•ã‚·ã‚®ãƒ€ãƒ.

I think it may have something to do with different Unicode encodings, since each character gets changed into two (or more) new ones - perhaps each character is being split into its individual bytes - but I have very little knowledge in this area. What do you think is wrong?

+6  A: 

Here's a download wrapper class which supports gzip and deflate compression, and which checks the encoding given in the response header and in the page's meta tags in order to decode the content correctly.

Instantiate the class, and call GetPage().

/// <summary>
/// Downloads a page over HTTP and returns it as a string, transparently
/// decompressing gzip/deflate bodies and picking the text encoding from the
/// response header first and from an HTML <c>meta</c> charset declaration second.
/// </summary>
/// <remarks>
/// Usage: construct the class and call <see cref="GetPage"/>. After the call,
/// <see cref="Headers"/> and <see cref="Url"/> reflect the received response.
/// </remarks>
public class HttpDownloader
{
    // Extracts the charset parameter from a Content-Type value. The value stops
    // at the next ';' so trailing parameters are not swallowed.
    private static readonly Regex ContentTypeCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>[^;]*)",
        RegexOptions.IgnoreCase);

    // Finds a charset declaration inside a single <meta ...> tag. Covers both
    // <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> and the
    // quoted HTML5 form <meta charset="utf-8">. "[^>]" keeps the match inside the tag.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+[^>]*?charset\s*=\s*[""']?(?<charset>[A-Za-z0-9_-]+)",
        RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used for the first decoding pass. Starts as ISO-8859-1 and is
    /// replaced by the charset announced in the response header, when that
    /// charset is known to the runtime.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response; set by <see cref="GetPage"/>.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>
    /// The address to download. After <see cref="GetPage"/> it holds the
    /// response URI reported by the server response.
    /// </summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for the given address.</summary>
    /// <param name="url">Absolute URL of the page; parsed immediately.</param>
    /// <param name="referer">Referer header value, or null/empty to send none.</param>
    /// <param name="userAgent">User-Agent header value, or null/empty to send none.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // verify the uri
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/> and returns the decoded, trimmed response body.
    /// </summary>
    /// <returns>The page content as text.</returns>
    /// <exception cref="WebException">The request failed.</exception>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }
        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }

        // Advertise the compressed encodings that ProcessContent can undo.
        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    /// <summary>
    /// Reads the whole response body into memory, decodes it with
    /// <see cref="Encoding"/>, then re-decodes it if a meta tag names a
    /// different charset.
    /// </summary>
    private string ProcessContent(HttpWebResponse response)
    {
        // Pick up the charset from the response header before decoding;
        // without this call the header is never consulted.
        SetEncodingFromHeader(response);

        using (Stream body = OpenBodyStream(response))
        using (MemoryStream memStream = new MemoryStream())
        {
            // Buffer the raw bytes so they can be decoded a second time if needed.
            byte[] buffer = new byte[0x1000];
            int bytesRead;
            while ((bytesRead = body.Read(buffer, 0, buffer.Length)) > 0)
            {
                memStream.Write(buffer, 0, bytesRead);
            }

            memStream.Position = 0;
            using (StreamReader reader = new StreamReader(memStream, Encoding))
            {
                string html = reader.ReadToEnd().Trim();
                return CheckMetaCharSetAndReEncode(memStream, html);
            }
        }
    }

    /// <summary>
    /// Returns the response stream, wrapped in a decompressor when the
    /// Content-Encoding header mentions gzip or deflate.
    /// </summary>
    private static Stream OpenBodyStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }
        if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }
        return raw;
    }

    /// <summary>
    /// Sets <see cref="Encoding"/> from the response's character set, falling
    /// back to parsing the Content-Type header when the character set is empty.
    /// Leaves <see cref="Encoding"/> untouched when no usable charset is found.
    /// </summary>
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = null;
        if (string.IsNullOrEmpty(response.CharacterSet))
        {
            Match m = ContentTypeCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                charset = m.Groups["charset"].Value.Trim().Trim('\'', '"');
            }
        }
        else
        {
            charset = response.CharacterSet;
        }

        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                Encoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Best effort: the server announced a charset this runtime does
                // not know, so keep the current encoding.
            }
        }
    }

    /// <summary>
    /// Looks for a charset declared in a meta tag of <paramref name="html"/> and,
    /// when it differs from <see cref="Encoding"/>, decodes the buffered bytes
    /// again with that charset.
    /// </summary>
    /// <param name="memStream">Seekable stream holding the raw page bytes.</param>
    /// <param name="html">The page as decoded by the first pass.</param>
    /// <returns>The re-decoded page, or <paramref name="html"/> unchanged.</returns>
    private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        // Invariant lower-casing: culture-sensitive casing would mangle names
        // such as "ISO-8859-1" under e.g. a Turkish culture.
        string charset = m.Groups["charset"].Value.ToLowerInvariant();
        if ((charset == "unicode") || (charset == "utf-16"))
        {
            // Kept from the original: these labels are treated as UTF-8.
            // Presumably because a tag that was readable after a byte-oriented
            // first pass cannot really be UTF-16 text.
            charset = "utf-8";
        }

        try
        {
            Encoding metaEncoding = Encoding.GetEncoding(charset);
            if (!metaEncoding.Equals(Encoding))
            {
                memStream.Position = 0L;
                // Not disposed on purpose: disposing the reader would close
                // memStream, which belongs to the caller.
                StreamReader recodeReader = new StreamReader(memStream, metaEncoding);
                html = recodeReader.ReadToEnd().Trim();
            }
        }
        catch (ArgumentException)
        {
            // Best effort: unknown charset in the meta tag, keep the first decoding.
        }

        return html;
    }
}
Mikael Svenson
Hey, that just works. Thanks.
Callum Rogers
Something I wrote last year for an azure project :) Glad it could be of use for you.
Mikael Svenson