Python/urllib

Basic Get
# Basic GET: fetch a page and read the body (Python 2, urllib2)
import urllib2

response = urllib2.urlopen('http://python.org/')
html = response.read()  # read() is a method -- without () you get the bound method, not the body

import urllib2 response = urllib2.urlopen('http://python.org/') if response.code != 200: print "failure"

POST data:
import urllib import urllib2

url = 'http://www.someserver.com/cgi-bin/register.cgi' values = {'name' : 'Michael Foord', 'location' : 'Northampton', 'language' : 'Python' }

data = urllib.urlencode(values) req = urllib2.Request(url, data) response = urllib2.urlopen(req) the_page = response.read

print the_page

Request Headers
# Send custom request headers (continues the POST example: url, values in scope)
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
headers = {'User-Agent': user_agent}
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)  # headers dict as third argument

# or: add headers one at a time on the Request object
request = urllib2.Request('http://your.tld/...')
request.add_header('User-Agent', 'some fake agent string')
request.add_header('Referer', 'fake referrer')
# ...
response = urllib2.urlopen(request)

Response Header
print response.info.getheader('Content-Type')

print response.info.headers # list

response.url # response url

Cookies
# Cookies: a CookieJar-backed opener stores and resends cookies automatically
import cookielib

cj = cookielib.CookieJar()  # CookieJar must be instantiated -- parens required
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
req = urllib2.Request(url, postdata, headers)
response = opener.open(req)  # use opener (not urllib2.urlopen) for all future requests
 * 1) use opener (not urllib2.urlopen) for all future requests so cookies are reused

References:
 * HOWTO Fetch Internet Resources Using urllib2 — Python v2.7.6 documentation - https://docs.python.org/2/howto/urllib2.html

Progress Bar
Progress bar: declare `global rem_file` — a global variable read by dlProgress to display the filename

# reporthook is called after each block: dlProgress(count, blockSize, totalSize)
urllib.urlretrieve(rem_file, loc_file, reporthook=dlProgress)

def dlProgress(count, blockSize, totalSize):
    """urlretrieve reporthook: print percent progress for rem_file (module global)."""
    percent = int(count * blockSize * 100 / totalSize)
    # \r returns to line start so the percentage updates in place
    sys.stdout.write("\r" + rem_file + "...%d%%" % percent)
    sys.stdout.flush()  # flush() is a method -- without () nothing is flushed


 * http - Python urllib2 Progress Hook - Stack Overflow - http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
 * How to write a download progress indicator in Python? - Python - K210.ORG - http://k210.org/python/how_to_write_a_download_progress_indicator_in_python/

Read web page 2.x
# Read a web page (Python 2, urllib)
import urllib

resp = urllib.urlopen('http://www.python.org')
html = resp.read()      # body; read() must be called
head = resp.headers     # httplib.HTTPMessage
headers = head.keys()   # header names; or 'for header in head' or head.items()
head['set-cookie']      # look up a single header by name

URL Encode

 * 1) #!/usr/bin/env python  (shebang line)


 * 1) urlencode a message from stdin or command parameters
 * 2) author: Kenneth Burgener  (c) 2013

import urllib import sys import select

msg = "" if select.select([sys.stdin,],[],[],0.0)[0]: msg = sys.stdin.readline.strip + " " if len(sys.argv) > 1: msg += " ".join(sys.argv[1:]) msg = msg.strip print urllib.quote(msg)

cookies
# Manual cookie handling: grab Set-Cookie from one response, replay it on the next
import urllib2

req1 = urllib2.Request(url1)
response = urllib2.urlopen(req1)
cookie = response.headers.get('Set-Cookie')

req2 = urllib2.Request(url2)
req2.add_header('cookie', cookie)  # use the cookie in subsequent requests
response = urllib2.urlopen(req2)
 * 1) Use the cookie in subsequent requests

Read web page 3.x
# Read a web page (Python 3.x)
import urllib.request

resp = urllib.request.urlopen('http://www.python.org')
html = resp.read()  # returns bytes; read() must be called

Convert byte characters to string
# urlopen() responses return bytes in Python 3; decode to str for printing
print(f.read(100).decode('utf-8'))

Download file: (2.x)
# Download a file to disk (Python 2.x)
import urllib

urllib.urlretrieve(url, filename)

Download file: (3.x)
# Download a file to disk (Python 3.x)
import urllib.request

urllib.request.urlretrieve(url, filename)

Open web browser
# Open a URL in the system default web browser
import webbrowser

webbrowser.open('http://www.google.com')

HTTP Basic Authentication
# HTTP Basic Authentication: build the Authorization header by hand
import urllib2
import base64

request = urllib2.Request("http://api.foursquare.com/v1/user")
# encodestring() appends newlines; strip them before use in a header
base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)

HTTP Basic Authentication Handler
# HTTP Basic Authentication via a password manager + handler
import urllib2

auth = urllib2.HTTPPasswordMgrWithDefaultRealm()  # must be instantiated -- parens required
auth.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib2.HTTPBasicAuthHandler(auth)
url_opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(url_opener)  # NOTE: this changes the default opener;
# if you do not wish to do this, use "url_opener.open(url_request)" everywhere instead
url_request = urllib2.Request('https://api.twitter.com/1/statuses/user_timeline.json?%s' % twitter_args)
url_output = urllib2.urlopen(url_request).read()  # read() must be called
 * 1) Note: this will change the default opener.
 * 2)    if you do not wish to do this, simply use "url_opener.open(url_request)" everywhere.

# Basic auth header built manually (same idea as above, different trim)
theurl = 'http://10.10.10.135/image.jpg'
req = urllib2.Request(theurl)
# [:-1] drops the trailing newline that encodestring() appends
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)
resp = urllib2.urlopen(req)
img = resp.read()  # read() must be called

# Basic auth handler with an explicit realm
# (drawback: you must already know the realm string)
import urllib2

TRIM_API_URL = 'http://api.tr.im/api'
auth_handler = urllib2.HTTPBasicAuthHandler()  # must be instantiated -- parens required
auth_handler.add_password(realm='tr.im',
                          uri=TRIM_API_URL,
                          user=USERNAME,
                          passwd=PASSWORD)
url_opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(url_opener)
response = urllib2.urlopen('%s/trim_simple?url=%s' % (TRIM_API_URL, url_to_trim))
url = response.read().strip()  # read() and strip() are methods
 * 1) the following has the problem of being required to already know the realm

urllib Session Tracking
Sessions Tracking: import urllib import urllib2 import random

print "-" * 10, "REQ1", "-" * 10 data = {'username': 'test', 'password': 'password1', 'id': str(random.randint(1, 100)), } data = urllib.urlencode(data) req1 = urllib2.Request('http://demo.oeey.com/session_a.php', data) req1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0') print "REQ1 HEADERS:", req1.headers.items print "REQ1 DATA:", req1.data

print "-" * 10, "RESP1", "-" * 10 resp1 = urllib2.urlopen(req1) cookie = resp1.headers.get('Set-Cookie') print 'RESP1 URL:', resp1.geturl # new url, if redirected print 'RESP1 CODE:', resp1.getcode # 200 print 'RESP1 COOKIE:', cookie print 'RESP1 HEADERS:', resp1.headers.items print "RESP1 HTML:\n", resp1.read

print "-" * 10, "REQ2", "-" * 10 data = { 'product': '3', } data = urllib.urlencode(data) req2 = urllib2.Request('http://demo.oeey.com/session_b.php', data) req2.add_header('Cookie', cookie) req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0') print "REQ2 HEADERS:", req2.headers.items print "REQ2 DATA:", req2.data

print "-" * 10, "RESP2", "-" * 10 resp2 = urllib2.urlopen(req2) print 'RESP2 URL:', resp2.geturl # new url, if redirected print 'RESP2 CODE:', resp2.getcode # 200 print 'RESP2 HEADERS:', resp2.headers.items print "RESP2 HTML:\n", resp2.read

session_a.php:

<?php
// Start the session (session_start is a function -- parens required)
session_start();

// Test redirect - resp1.geturl()
//$_SESSION['id'] = 'redirect';
//header('Location: http://demo.oeey.com/session_b.php');

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// $_REQUEST is a combination of $_GET, $_POST and $_COOKIE.
if (isset($_REQUEST['id'])) {
    $_SESSION['id'] = $_REQUEST['id'];
} else {
    $_SESSION['id'] = rand();  // rand is a function -- parens required
}
echo "Request ID: " . $_SESSION['id'];

session_b.php:

<?php
// Start the session (session_start is a function -- parens required)
session_start();

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// Show session data (populated by session_a.php)
echo "Session Data: \n";
print_r($_SESSION);

// Show ID
echo "ID: \n";
if (isset($_SESSION['id'])) {
    echo $_SESSION['id'];
} else {
    echo "ID not set";
}

simpler example
url = 'http://apc.oeey.com/login.tgi' values = {'Username' : 'admin', 'Password' : 'admin', } data = urllib.urlencode(values) req = urllib2.Request(url, data) response = urllib2.urlopen(req, timeout=3) cookie = response.headers.get('Set-Cookie') print cookie # 'DLILPC="W5J/nTupJF0hyrv"; Version=1; Path=/'
 * 1) LOGON

# POWER OFF: replay the session cookie on the control request
url = 'http://apc.oeey.com/outlet?8=OFF'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)
 * 1) POWER OFF

url = 'http://apc.oeey.com/logout' req = urllib2.Request(url) req.add_header('Cookie', cookie) response = urllib2.urlopen(req, timeout=3) cookie = response.headers.get('Set-Cookie') print cookie # 'DLILPC=""; Version=1; Max-Age=0; Path=/'
 * 1) LOGOUT

URL Cookie Session Tracking
import urllib
import urllib2
import re

# Log in; the session token comes back embedded in the redirect URL.
# Possible exceptions: socket.timeout, urllib2.HTTPError (403 Forbidden)
url = 'http://apc.oeey.com/Forms/login1'
values = {'login_username': 'apc',
          'login_password': 'apc', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
 * 1) exceptions:
 * 2)   socket.timeout: timed out
 * 3)   urllib2.HTTPError: HTTP Error 403: Forbidden

# the post-redirect URL carries the session token in its path
print response.url # http://apc.oeey.com/NMC/GGcOPeRq8+FWctMifeoezA/home.htm

match = re.findall('http://apc.oeey.com/NMC/(.*)/home.htm', response.url) print match cookie = match[0]
 * 1) get cookie

url2 = 'http://apc.oeey.com/NMC/{}/Forms/outlctrl1'.format(cookie) values2 = { 'rPDUOutletCtrl': '4', 'OL_Cntrl_Col1_Btn': '?8,2', 'submit': 'Next >>'} data2 = urllib.urlencode(values2) req2 = urllib2.Request(url2, data2) response2 = urllib2.urlopen(req2) print response2.url
 * 1) use cookie:

url6 = 'http://apc.oeey.com/NMC/{}/logout.htm'.format(cookie) req6 = urllib2.Request(url6) response6 = urllib2.urlopen(req6) print response6.url # http://apc.oeey.com/NMC/X7tmWWC4oYI0Z4hQbnlLaQ/logout.htm
 * 1) logoff:
 * 1) html = response6.read()  (read() must be called to get the body)