How to scrape a website that requires login first with Python

user2830451 picture user2830451 · Nov 18, 2013 · Viewed 84.6k times · Source

First of all, I think it's worth saying that, I know there are a bunch of similar questions but NONE of them works for me...

I'm a newbie on Python, html and web scraper. I'm trying to scrape user information from a website which needs to login first. In my tests I use scraper my email settings from github as examples. The main page is 'https://github.com/login' and the target page is 'https://github.com/settings/emails'

Here are a list of methods I've tried

##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)


br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling it's session
br.open('https://github.com/login')

for f in br.forms():
    print f

br.select_form(nr=0)

# User credentials
br.form['login'] = 'myusername'
br.form['password'] = 'mypwd'

# Login
br.submit()

br.open('github.com/settings/emails').read()


################ Method 2
import urllib, urllib2, cookielib

username = 'myusername'
password = 'mypwd'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'username' : username, 'j_password' : password})
opener.open('https://github.com/login', login_data)
resp = opener.open('https://github.com/settings/emails')
print resp.read()



############# Method 3
import urllib
opener = urllib.FancyURLopener()
print opener.open('http://myusername:[email protected]/settings/emails').read()




########## Method 4
import mechanize
import cookielib

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

br.addheaders = [('User-agent', 'Chrome')]

br.add_password('https://github.com/settings/emails', 'myusername', 'mypwd')
br.open('https://github.com/settings/emails')
print br.response().read()



############ Methods 5
from requests import session

payload = {
    'action': 'login',
    'username': 'myusername',
    'password': 'mypwd'
}

with session() as c:
    c.post('https://github.com/login', data=payload)
    request = c.get('https://github.com/settings/emails')
    print request.headers
    print request.text



########### Method 6
import requests
from requests.packages.urllib3 import add_stderr_logger
import sys
from bs4 import BeautifulSoup as bs

add_stderr_logger()
s = requests.Session()

s.headers['User-Agent'] = 'Chrome'

username = 'myusername'
password = 'mypwd'
url = 'https://github.com/login'

# after examining the HTML of the website you're trying to log into
# set name_form to the name of the form element that contains the name and
# set password_form to the name of the form element that will contain the password
login = {'login': username, 'password': password}
login_response = s.post(url, data=login)
for r in login_response.history:
    if r.status_code == 401:  # 401 means authentication failed
        print 'error!'
        sys.exit(1)  # abort


pdf_response = s.get('https://github.com/settings/emails')  # Your cookies and headers are automatically included
soup = bs(pdf_response.content)

Also I've read some discussions about differences between HTTP Authentication and cookies. Still none of them worked.

Please help and any help would be appreciated. Thank you very much.

Answer

Holy Mackerel picture Holy Mackerel · Nov 18, 2013

This works for me:

##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

# Browser
br = mechanize.Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling it's session
br.open('https://github.com/login')

# View available forms
for f in br.forms():
    print f

# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)

# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'

# Login
br.submit()

print(br.open('https://github.com/settings/emails').read())

You were not far off at all!