I have to scrape a table from a secure site and I'm having trouble logging in to the page and retrieving the authentication token and any other associated cookies. Am I doing something wrong here?
public NameValueCollection LoginToDatrose()
{
var loginUriBuilder = new UriBuilder();
loginUriBuilder.Host = DatroseHostName;
loginUriBuilder.Path = BuildURIPath(DatroseBasePath, LOGIN_PAGE);
loginUriBuilder.Scheme = "https";
var boundary = Guid.NewGuid().ToString();
var postData = new NameValueCollection();
postData.Add("LoginName", DatroseUserName);
postData.Add("Password", DatrosePassword);
var data = Encoding.ASCII.GetBytes(postData.ToQueryString(false));
var request = WebRequest.Create(loginUriBuilder.Uri) as HttpWebRequest;
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = data.Length;
using (var d = request.GetRequestStream())
{
d.Write(data, 0, data.Length);
}
var response = request.GetResponse() as HttpWebResponse;
var responseCookies = new NameValueCollection();
foreach (var nvp in response.Cookies.OfType<Cookie>())
{
responseCookies.Add(nvp.Name, nvp.Value);
}
//using (var responseData = response.GetResponseStream())
//using (var responseReader = new StreamReader(responseData))
//{
// var theResponse = responseReader.ReadToEnd();
// Debug.WriteLine(theResponse);
//}
return responseCookies;
}
I get no values in the return object. It does not fail. The value of theResponse
(when not commented out) seems to be the HTML of the login page.
Any assistance would be greatly appreciated.
OK, the problem here seems related to the 302 redirect that would occur after the credentials were passed. The HttpWebRequest
would automatically follow the 302.
Ultimately, I ended up doing things a little differently. First, I subclassed the WebClient
class as follows:
public class CookiesAwareWebClient : WebClient
{
private CookieContainer outboundCookies = new CookieContainer();
private CookieCollection inboundCookies = new CookieCollection();
public CookieContainer OutboundCookies
{
get
{
return outboundCookies;
}
}
public CookieCollection InboundCookies
{
get
{
return inboundCookies;
}
}
public bool IgnoreRedirects { get; set; }
protected override WebRequest GetWebRequest(Uri address)
{
WebRequest request = base.GetWebRequest(address);
if (request is HttpWebRequest)
{
(request as HttpWebRequest).CookieContainer = outboundCookies;
(request as HttpWebRequest).AllowAutoRedirect = !IgnoreRedirects;
}
return request;
}
protected override WebResponse GetWebResponse(WebRequest request)
{
WebResponse response = base.GetWebResponse(request);
if (response is HttpWebResponse)
{
inboundCookies = (response as HttpWebResponse).Cookies ?? inboundCookies;
}
return response;
}
}
This allowed me to use a WebClient
class that was cookies-aware as well as one that I could control the redirect. Then I rewrote my code for logging in as follows:
public NameValueCollection LoginToDatrose()
{
var loginUriBuilder = new UriBuilder();
loginUriBuilder.Host = DatroseHostName;
loginUriBuilder.Path = BuildURIPath(DatroseBasePath, LOGIN_PAGE);
loginUriBuilder.Scheme = "https";
var postData = new NameValueCollection();
postData.Add("LoginName", DatroseUserName);
postData.Add("Password", DatrosePassword);
var responseCookies = new NameValueCollection();
using (var client = new CookiesAwareWebClient())
{
client.IgnoreRedirects = true;
var clientResponse = client.UploadValues(loginUriBuilder.Uri, "POST", postData);
foreach (var nvp in client.InboundCookies.OfType<Cookie>())
{
responseCookies.Add(nvp.Name, nvp.Value);
}
}
return responseCookies;
}
...and everything worked swimmingly.