I am very new to this so please pardon any ignorance.
I have created my first multi-threaded application, and its purpose is to make numerous web requests, parse each page source, and store the results in tables for further interrogation. Theoretically there could be as many as 30,000-40,000 requests, hence the need for multi-threading. Each request gets a thread. I think everything is working, except that I very often get only a partial page source. It's almost as if the StreamReader gets interrupted while reading the response. When I make the same request in a browser, I get the entire page. I thought it might have to do with threading, although I think I am still making the calls synchronously. (Ideally, I would like to make the calls asynchronously, but I am not sure how to go about that.) Is there a way of knowing whether the page source is complete, in order to determine whether to request it again? I am sure there are complexities here that I am missing. Any help on any of the code would be greatly appreciated.
Sorry about the formatting. Below is part of the code for the class that makes the requests:
using System;
using System.Collections.Generic;
using System.Text;
using System.Data.Sql;
using System.Data.SqlClient;
using System.Threading;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace M4EverCrawler
{
public class DomainRun
{
/// <summary>
/// Launches the three pipeline stages (<c>Run1</c>, <c>Run2</c>, <c>Run3</c>),
/// each on its own dedicated thread, and returns without waiting for them.
/// </summary>
public void Start()
{
    Thread captureWorker = new Thread(Run1);
    captureWorker.Start();

    Thread parseWorker = new Thread(Run2);
    parseWorker.Start();

    Thread metricsWorker = new Thread(Run3);
    metricsWorker.Start();
}
/// <summary>
/// Creates a run bound to the given domain queue and proxy queue.
/// </summary>
/// <param name="dnq">Queue manager that supplies the domains to capture.</param>
/// <param name="prxQ">Queue manager that supplies the proxies used for requests.</param>
public DomainRun(DNQueueManager dnq, ProxyQueueManager prxQ)
{
    this.dnqManager = dnq;
    this.ProxyManager = prxQ;
}
// Source of domains to capture; loaded and drained by Run1.
private DNQueueManager dnqManager;
// Source of proxy addresses; loaded by Run1 and consulted by CapturePage.
private ProxyQueueManager ProxyManager;
// Hand-off between Run1 (adds captured pages) and Run2 (takes them for parsing).
public StagingQueue StagingQueue = new StagingQueue();
// Hand-off between Run2 (adds parsed pages and their links) and Run3 (takes them for metrics).
public MetricsQueueManager MQmanager = new MetricsQueueManager();
// Receives items from Run3 once the metric lookups have been run.
public CommitQueueManager CQmanager = new CommitQueueManager();
/// <summary>
/// Capture stage: loads the domain and proxy queues once, then loops forever,
/// taking each available domain, downloading its page via <c>CapturePage</c>
/// and passing the result to <c>StagingQueue</c>.
/// </summary>
/// <remarks>
/// The method never returns. Each pass ends with a short random pause of
/// 0-19 ms so the loop does not spin when the queue is empty.
/// </remarks>
protected void Run1()
{
    dnqManager.LoadDNs();
    ProxyManager.LoadProxies();

    // One generator for the lifetime of this worker. Creating a new Random on
    // every pass re-seeds it each time (clock-based on .NET Framework, so
    // back-to-back instances can produce the same value) and allocates needlessly.
    Random jitter = new Random();

    while (true)
    {
        // NOTE(review): the availability check and GetDND() are two separate calls;
        // whether that is safe with several consumers depends on DNQueueManager - confirm.
        if (dnqManager.IsDNDavailable)
        {
            DomainData dnd = dnqManager.GetDND();
            dnd.PageSource = CapturePage(dnd.DomainName);
            StagingQueue.AddDN2Q(dnd);
        }

        Thread.Sleep(jitter.Next(20));
    }
}
/// <summary>
/// Parse stage: loops forever, taking each captured page from <c>StagingQueue</c>,
/// parsing it with <c>ParsePage</c>, and queueing both the parsed page and one new
/// <c>DomainData</c> per discovered link on <c>MQmanager</c>.
/// </summary>
/// <remarks>
/// The method never returns. Each pass ends with a short random pause of
/// 0-19 ms so the loop does not spin when the queue is empty.
/// </remarks>
protected void Run2()
{
    // One generator for the lifetime of this worker. Creating a new Random on
    // every pass re-seeds it each time (clock-based on .NET Framework, so
    // back-to-back instances can produce the same value) and allocates needlessly.
    Random jitter = new Random();

    while (true)
    {
        if (StagingQueue.IsDNDavailable)
        {
            DomainData dnd = StagingQueue.GetDND();

            // Parsing options are (re)applied before every page is parsed.
            MaxOutboundLinks = 3;
            AvoidHttps = true;
            InsideLinks = false;
            VerifyBackLinks = true;

            MQmanager.AddDN2Q(ParsePage(dnd));

            foreach (string link in dnd.Hlinks)
            {
                // 'link' is already a string, so no ToString() call is needed.
                DomainData dndLink = new DomainData(dnd.MainSeqno, link);
                dndLink.ParentDomainName = dnd.DomainName;

                // NOTE(review): this clears the PARENT's page source, and only when the
                // page has at least one link. Kept as in the original; confirm whether
                // it was meant to run once after the loop, or unconditionally.
                dnd.PageSource = String.Empty;

                MQmanager.AddDN2Q(dndLink);
            }
        }

        Thread.Sleep(jitter.Next(20));
    }
}
/// <summary>
/// Metrics stage: loops forever, taking each available item from <c>MQmanager</c>,
/// running the Alexa, Compete and Quantcast lookups on it, and handing it to
/// <c>CQmanager</c>.
/// </summary>
/// <remarks>
/// The method never returns. Each pass ends with a short random pause of
/// 0-19 ms so the loop does not spin when the queue is empty.
/// </remarks>
protected void Run3()
{
    // One generator for the lifetime of this worker. Creating a new Random on
    // every pass re-seeds it each time (clock-based on .NET Framework, so
    // back-to-back instances can produce the same value) and allocates needlessly.
    Random jitter = new Random();

    while (true)
    {
        if (MQmanager.IsDNDavailable)
        {
            DomainData dnd = MQmanager.GetDND();
            RunAlexa(dnd);
            RunCompete(dnd);
            RunQuantcast(dnd);
            CQmanager.AddDN2Q(dnd, MQmanager, 1000);
        }

        Thread.Sleep(jitter.Next(20));
    }
}
/// <summary>
/// Downloads the page at <paramref name="URIstring"/> through a proxy and returns
/// the response body as a string.
/// </summary>
/// <param name="URIstring">Address of the page to fetch; passed to the Uri constructor.</param>
/// <returns>
/// The full text read from the response stream, or <c>String.Empty</c> when
/// <paramref name="URIstring"/> cannot be turned into a <c>Uri</c>. What is returned
/// after a failed request depends on the catch handler, whose body is elided here.
/// </returns>
private string CapturePage(string URIstring)
{
Uri myUri;
try
{
myUri = new Uri(URIstring);
}
// Any failure to construct the Uri is treated as "no page"; the exception itself is not used.
catch (Exception URIex)
{
return String.Empty;
}
// Use the current proxy if there is one, otherwise ask the manager for a proxy.
// NOTE(review): GetCurrentProxy() is called twice; if another thread changes the
// current proxy between the calls, the two results may differ - confirm.
string proxyIP = ProxyManager.GetCurrentProxy() == "" ? ProxyManager.GetProxy() : ProxyManager.GetCurrentProxy();
// NOTE(review): proxCtr is not used in the visible code; it may be used in the elided catch body.
int proxCtr = 0;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(myUri);
WebProxy Proxy = new WebProxy(proxyIP);
request.Proxy = Proxy;
// 20 seconds, in milliseconds.
// NOTE(review): per the HttpWebRequest docs, Timeout governs GetResponse(); reads from
// the response stream are governed by ReadWriteTimeout, which is not set here - confirm.
request.Timeout = 20000;
try
{
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
// NOTE(review): the body is always decoded as ASCII, regardless of the charset the
// server declares; bytes outside the ASCII range will not decode to the original
// characters. Consider deriving the encoding from the response - confirm.
using (StreamReader strmRdr = new StreamReader(response.GetResponseStream(), Encoding.ASCII))
{
return strmRdr.ReadToEnd();
}
}
}
// WebException derives from InvalidOperationException, so request failures land here.
// The handler body is elided in this excerpt.
catch (InvalidOperationException Wex)
{
. . .
}
}