I'm experimenting with http://robobrowser.readthedocs.org/en/latest/readme.html, a new python library based on the beautiful soup library. I'm trying to test it out by opening an html page and returning it within a django app, but I can't figure out to do this most simple task. My django app contains :
def index(request):
p=str(request.POST.get('p', False)) # p='https://www.yahoo.com/'
browser = RoboBrowser(history=True)
postedmessage = browser.open(p)
return HttpResponse(postedmessage)
How can I return all the page's HTML?
You can try using the parsed
property.
Code:
from robobrowser import RoboBrowser
url = "http://www.google.com"
br = RoboBrowser(history=True)
br.open(url)
print br.parsed
Result:
<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-PH"><head><meta content="/images/google_favicon_128.png" itemprop="image"/><title>Google</title><script>(function(){
window.google={kEI:"-RFgU9LgJsq6uATKqYGoDg",getEI:function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI},https:function(){return"https:"==window.location.protocol},kEXPI:"17259,4000116,4007661,4007830,4008142,4009033,4009641,4010806,4010858,4010899,4011228,4011258,4011679,4012373,4012504,4012860,4013374,4013414,4013591,4013723,4013758,4013787,4013823,4013941,4013967,4013979,4014016,4014033,4014093,4014431,4014515,4014636,4014671,4014810,4014813,4014828,4014991,4015119,4015155,4015234,4015260,4015519,4015550,4015587,4015635,4015638,4015639,4015772,4015901,4016005,4016042,4016127,4016309,4016363,4016367,4016371,4016391,4016425,4016452,4016466,4016479,4016487,4016638,4016703,4016730,4016786,4016800,4016824,4016855,4016933,4016969,4016978,4017042,4017079,4017177,4017202,4017206,4017280,4017285,4017294,4017301,4017336,4017556,8300015,8300017,8500165,8500223,8500240,8500252,8500255,8500283,8500306,8500313,8500325,8500332,8500349,10200002,10200012,10200029,10200038,10200040,10200048,10200053,10200055,10200066,10200083,10200120,10200134,10200136,10200155,10200157,10200159,10200164,10200195,10200211,10200215,10200221,10200224,10200231,10200236,10200242,10200246,10200252",kCSI:{e:"17259,4000116,4007661,4007830,4008142,4009033,4009641,4010806,4010858,4010899,4011228,4011258,4011679,4012373,4012504,4012860,4013374,4013414,4013591,4013723,4013758,4013787,4013823,4013941,4013967,4013979,4014016,4014033,4014093,4014431,4014515,4014636,4014671,4014810,4014813,4014828,4014991,4015119,4015155,4015234,4015260,4015519,4015550,4015587,4015635,4015638,4015639,4015772,4015901,4016005,4016042,4016127,4016309,4016363,4016367,4016371,4016391,4016425,4016452,4016466,4016479,4016487,4016638,4016703,4016730,4016786,4016800,4016824,4016855,4016933,4016969,4016978,4017042,4017079,4017177,4017202,4017206,4017280,4017285,4017294,4017301,4017336,4017556,8300015,8300017,8500165,8500223,8500240,8500252,8500255,8500283,8500306,8500313,8500325,8500332,8500349,10200002,10200012,10200029,10200038,10200040,10200048,10200053,10200055,10200066,10200083,10200120,10200134,10200136,10200155,10200157,10200159,10200164,10200195,10200211,10200215,10200221,10200224,10200231,10200236,10200242,10200246,10200252",ei:"-RFgU9LgJsq6uATKqYGoDg"},authuser:0,ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(a,b,c,h,k){var d=
new Image,f=google.lc,e=google.li,g="";d.onerror=d.onload=d.onabort=function(){delete f[e]};f[e]=d;c||-1!=b.search("&ei=")||(g="&ei="+google.getEI(h));c=c||"/"+(k||"gen_204")+"?atyp=i&ct="+a+"&cad="+b+g+"&zx="+google.time();a=/^http:/i;a.test(c)&&google.https()?(google.ml(Error("GLMM"),!1,{src:c}),delete f[e]):(d.src=c,google.li=e+1)},lc:[],li:0,y:{},x:function(a,b){google.y[a.id]=[a,b];return!1},load:function(a,b,c){google.x({id:a+l++},function(){google.load(a,b,c)})}};var l=0;})();
(function(){google.sn="webhp";google.timers={};google.startTick=function(a,b){var f=google.time();google.timers[a]={t:{start:f},bfr:!!b};};google.tick=function(a,b,f){google.timers[a]||google.startTick(a);google.timers[a].t[b]=f||google.time()};google.startTick("load",!0);
try{}catch(d){}})();
var _gjwl=location;function _gjuc(){var a=_gjwl.href.indexOf("#");if(0<=a&&(a=_gjwl.href.substring(a),0<a.indexOf("&q=")||0<=a.indexOf("#q="))&&(a=a.substring(1),-1==a.indexOf("#"))){for(var d=0;d<a.length;){var b=d;"&"==a.charAt(b)&&++b;var c=a.indexOf("&",b);-1==c&&(c=a.length);b=a.substring(b,c);if(0==b.indexOf("fp="))a=a.substring(0,d)+a.substring(c,a.length),c=d;else if("cad=h"==b)return 0;d=c}_gjwl.href="/search?"+a+"&cad=h";return 1}return 0}
function _gjh(){!_gjuc()&&window.google&&google.x&&google.x({id:"GJH"},function(){google.nav&&google.nav.gjh&&google.nav.gjh()})};
window._gjh&&_gjh();</script><style>#gbar,#guser{font-size:13px;padding-top:1px !important;}#gbar{height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}@media all{.gb1{height:22px;margin-right:.5em;vertical-align:top}#gbar{float:left}}a.gb1,a.gb4{text-decoration:underline !important}a.gb1,a.gb4{color:#00c !important}.gbi .gb4{color:#dd8e27 !important}.gbf .gb4{color:#900 !important}</style><style>body,td,a,p,.h{font-family:arial,sans-serif}body{margin:0;overflow-y:scroll}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}.h{color:#36c}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{height:25px;width:496px}.gsfi,.lst{font:18px arial,sans-serif}.gsfs{font:17px arial,sans-serif}.ds{display:inline-box;display:inline-block;margin:3px 0 4px;margin-left:4px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}body{background:#fff;color:black}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#36c}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff !important}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px}.lsbb{display:block}.ftl,#fll a{display:inline-block;margin:0 12px}.lsb{background:url(/images/srpr/nav_logo80.png) 0 -258px repeat-x;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;font:15px arial,sans-serif;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}#addlang a{padding:0 3px}</style><script></script></head><body bgcolor="#fff"><script>(function(){var src='/images/nav_logo176.png';var iesg=false;document.body.onload = function(){window.n && window.n();if (document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><textarea id="csi" style="display:none"></textarea><div id="mngb"> <div id="gbar"><nobr><b class="gb1">Search</b> <a class="gb1" href="http://www.google.com.ph/imghp?hl=en&tab=wi">Images</a> <a class="gb1" href="http://maps.google.com.ph/maps?hl=en&tab=wl">Maps</a> <a class="gb1" href="https://play.google.com/?hl=en&tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=PH&tab=w1">YouTube</a> <a class="gb1" href="http://news.google.com.ph/nwshp?hl=en&tab=wn">News</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a> <a class="gb1" href="http://www.google.com.ph/intl/en/options/" style="text-decoration:none"><u>More</u> »</a></nobr></div><div id="guser" width="100%"><nobr><span class="gbi" id="gbn"></span><span class="gbf" id="gbf"></span><span id="gbe"></span><a class="gb4" href="http://www.google.com.ph/history/optout?hl=en">Web History</a> | <a class="gb4" href="/preferences?hl=en">Settings</a> | <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&continue=http://www.google.com.ph/%3Fgfe_rd%3Dcr%26ei%3D-BFgU62INOmNiAeYnYDoAg" id="gb_70" target="_top">Sign in</a></nobr></div><div class="gbh" style="left:0"></div><div class="gbh" style="right:0"></div> </div><center><br clear="all" id="lgpd"/><div id="lga"><div style="padding:28px 0 3px"><div align="left" id="hplogo" onload="window.lol&&lol()" style="height:110px;width:276px;background:url(/images/srpr/logo9w.png) no-repeat" title="Google"><div nowrap="" style="color:#777;font-size:16px;font-weight:bold;position:relative;top:70px;left:218px">Philippines</div></div></div><br/></div><form action="/search" name="f"><table cellpadding="0" cellspacing="0"><tr valign="top"><td width="25%"> </td><td align="center" nowrap=""><input name="ie" type="hidden" value="ISO-8859-1"/><input name="hl" type="hidden" value="en-PH"/><input name="source" type="hidden" value="hp"/><div class="ds" style="height:32px;margin:4px 0"><input autocomplete="off" class="lst" maxlength="2048" name="q" size="57" style="color:#000;margin:0;padding:5px 8px 0 6px;vertical-align:top" title="Google Search" value=""/></div><br style="line-height:0"/><span class="ds"><span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span></span><span class="ds"><span class="lsbb"><input class="lsb" name="btnI" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'" type="submit" value="I'm Feeling Lucky"/></span></span></td><td align="left" class="fl sblc" nowrap="" width="25%"><a href="/advanced_search?hl=en-PH&authuser=0">Advanced search</a><a href="/language_tools?hl=en-PH&authuser=0">Language tools</a></td></tr></table><input id="gbv" name="gbv" type="hidden" value="1"/></form><div id="gac_scont"></div><div style="font-size:83%;min-height:3.5em"><br/><div id="als"><font id="addlang" size="-1">Google.com.ph offered in: <a href="http://www.google.com.ph/setprefs?sig=0_SbhOaVIheKTw2jFHRcEg8o-Evng%3D&hl=tl&source=homepage">Filipino</a> <a href="http://www.google.com.ph/setprefs?sig=0_SbhOaVIheKTw2jFHRcEg8o-Evng%3D&hl=ceb&source=homepage">Cebuano</a></font><br/><br/></div></div><span id="footer"><div style="font-size:10pt"><div id="fll" style="margin:19px auto;text-align:center"><a href="/intl/en/ads/">Advertising Programs</a><a href="http://www.google.com.ph/intl/en/services/">Business Solutions</a><a href="/intl/en/about.html">About Google</a><a href="http://www.google.com.ph/setprefdomain?prefdom=US&sig=0_dQ2pwXFotFQfDlj9qmDCkzdxCdA%3D" id="fehl">Google.com</a></div></div><p style="color:#767676;font-size:8pt">© 2013 - <a href="/intl/en/policies/">Privacy & Terms</a></p></span></center><div id="xjsd"></div><div data-jiis="bp" id="xjsi"><script>if(google.y)google.y.first=[];(function(){function b(a){window.setTimeout(function(){var c=document.createElement("script");c.src=a;document.getElementById("xjsd").appendChild(c)},0)}google.dljp=function(a){google.xjsu=a;b(a)};google.dlj=b;})();
if(!google.xjs){window._=window._||{};window._._DumpException=function(e){throw e};if(google.timers&&google.timers.load.t){google.timers.load.t.xjsls=new Date().getTime();}google.dljp('/xjs/_/js/k\x3dxjs.hp.en_US.RLLpSOAzMFM.O/m\x3dsb_he,pcc/rt\x3dj/d\x3d1/sv\x3d1/rs\x3dAItRSTOBXfxSyWrXjOGBi9e9cIs5cEBO6A');google.xjs=1;}google.pmc={"sb_he":{"agen":true,"cgen":true,"client":"heirloom-hp","dh":true,"ds":"","eqch":true,"fl":true,"host":"google.com.ph","jam":0,"jsonp":true,"msgs":{"cibl":"Clear Search","dym":"Did you mean:","lcky":"I\u0026#39;m Feeling Lucky","lml":"Learn more","oskt":"Input tools","psrc":"This search was removed from your \u003Ca href=\"/history\"\u003EWeb History\u003C/a\u003E","psrl":"Remove","sbit":"Search by image","srch":"Google Search"},"ovr":{},"pq":"","qcpw":false,"scd":10,"sce":5,"stok":"QGTPqfOgiEZ_AI3e5vphR6-NOmw"},"pcc":{}};google.y.first.push(function(){if(google.med){google.med('init');google.initHistory();google.med('history');}});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);}</script></div><script>(function(){if(google.timers&&google.timers.load.t){var b,c,d,e,g=function(a,f){a.removeEventListener?(a.removeEventListener("load",f,!1),a.removeEventListener("error",f,!1)):(a.detachEvent("onload",f),a.detachEvent("onerror",f))},h=function(a){e=(new Date).getTime();++c;a=a||window.event;a=a.target||a.srcElement;g(a,h)},k=document.getElementsByTagName("img");b=k.length;for(var l=c=0,m;l<b;++l)m=k[l],m.complete||"string"!=typeof m.src||!m.src?++c:m.addEventListener?(m.addEventListener("load",h,!1),m.addEventListener("error",
h,!1)):(m.attachEvent("onload",h),m.attachEvent("onerror",h));d=b-c;var n=function(){if(google.timers.load.t){google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=e;google.kCSI.imc=c;google.kCSI.imn=b;google.kCSI.imp=d;void 0!==google.stt&&(google.kCSI.stt=google.stt);google.csiReport&&google.csiReport()}};window.addEventListener?window.addEventListener("load",n,!1):window.attachEvent&&
window.attachEvent("onload",n);google.timers.load.t.prt=e=(new Date).getTime()};})();
</script></body></html>
[Finished in 4.3s]
How it scrapes is up to you, though. In any case, if you're having difficulties getting to know new libraries such as this, always exhaust the documentation.
From the site itself:
parsed
Lazily parse response content, using HTML parser specified by the browser.
Or you can use dir
on br
and get the following:
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_url', '_cursor', '_maxlen', '_states', '_traverse', '_update_state', 'back', 'find', 'find_all', 'follow_link', 'forward', 'get_form', 'get_forms', 'get_link', 'get_links', 'history', 'open', 'parsed', 'parser', 'response', 'select', 'session', 'state', 'submit_form', 'timeout', 'url']
[Finished in 5.3s]
As you can see, towards the end, parsed
is shown.
Hope this helps.