Python/urllib


urllib

Basic Get

import urllib2

response = urllib2.urlopen('http://python.org/')
html = response.read()

# check the HTTP status code
if response.code != 200:
    print "failure"

POST data:

import urllib
import urllib2

url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python' }

data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
the_page = response.read()

print the_page
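
The same POST works in Python 3.x with urllib.parse and urllib.request; a minimal sketch (note the encoded data must be bytes):

import urllib.parse
import urllib.request

url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}

data = urllib.parse.urlencode(values).encode('utf-8')  # bytes, not str
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()

print(the_page)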

Request Headers

...
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
# or
request = urllib2.Request('http://your.tld/...')
request.add_header('User-Agent', 'some fake agent string')
request.add_header('Referer', 'fake referrer')
...
response = urllib2.urlopen(request)

Response Header

print response.info().getheader('Content-Type')
print response.info().headers  # list
response.url  # response url
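
In Python 3.x the response object exposes the same information; a minimal sketch, assuming resp came from urllib.request.urlopen():

print(resp.getheader('Content-Type'))
print(resp.headers.items())  # list of (name, value) pairs
print(resp.url)              # response url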

Cookies

import cookielib
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# use the opener for all subsequent requests
req = urllib2.Request(url, postdata, headers)
response = opener.open(req)
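
cookielib was renamed http.cookiejar in Python 3.x; a minimal sketch of the same pattern:

import http.cookiejar
import urllib.request

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# use the opener for all subsequent requests so cookies are sent back automatically
response = opener.open('http://python.org/')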

Progress Bar

Progress bar: [1]

import sys
import urllib

def dlProgress(count, blockSize, totalSize):
    # rem_file is a global (the remote URL) so the hook can label its output
    percent = int(count * blockSize * 100 / totalSize)
    sys.stdout.write("\r" + rem_file + "...%d%%" % percent)
    sys.stdout.flush()

urllib.urlretrieve(rem_file, loc_file, reporthook=dlProgress)
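
The same reporthook signature works with the legacy urllib.request.urlretrieve() in Python 3.x; a minimal sketch, assuming the same rem_file/loc_file placeholders as above:

import sys
import urllib.request

def dlProgress(count, blockSize, totalSize):
    percent = int(count * blockSize * 100 / totalSize)
    sys.stdout.write("\r...%d%%" % percent)
    sys.stdout.flush()

urllib.request.urlretrieve(rem_file, loc_file, reporthook=dlProgress)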


Read web page 2.x

import urllib
resp = urllib.urlopen('http://www.python.org')
html = resp.read()
head = resp.headers
headers = head.keys()  # or 'for header in head' or 'head.items()'
head['set-cookie']

URL Encode

#!/usr/bin/env python

# urlencode a message from stdin or command parameters
# author: Kenneth Burgener <kenneth@k.ttak.org> (c) 2013

import urllib
import sys
import select

msg = ""
if select.select([sys.stdin,],[],[],0.0)[0]:
    msg = sys.stdin.readline().strip() + " "
if len(sys.argv) > 1:
    msg += " ".join(sys.argv[1:])
msg = msg.strip()
print urllib.quote(msg)
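
In Python 3.x the quoting functions moved to urllib.parse; a minimal sketch:

import urllib.parse
print(urllib.parse.quote("hello world & more"))  # hello%20world%20%26%20more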

cookies

cookies: [2]

import urllib2
req1 = urllib2.Request(url1)
response = urllib2.urlopen(req1)
cookie = response.headers.get('Set-Cookie')
# Use the cookie in subsequent requests
req2 = urllib2.Request(url2)
req2.add_header('cookie', cookie)
response = urllib2.urlopen(req2)

Read web page 3.x

import urllib.request
resp = urllib.request.urlopen('http://www.python.org')
html = resp.read()

Convert byte characters to string

print(html[:100].decode('utf-8'))  # bytes -> str

Download File

Download file: (2.x)

import urllib
urllib.urlretrieve(url, filename)

Download file: (3.x)

import urllib.request
urllib.request.urlretrieve(url, filename)

Open web browser

import webbrowser
webbrowser.open('http://www.google.com')

HTTP Basic Authentication

HTTP Basic Authentication [3]

import urllib2, base64
request = urllib2.Request("http://api.foursquare.com/v1/user")
base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)
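
A rough Python 3.x equivalent, assuming the same username and password variables (base64.b64encode works on bytes):

import urllib.request
import base64

request = urllib.request.Request("http://api.foursquare.com/v1/user")
credentials = base64.b64encode(('%s:%s' % (username, password)).encode('utf-8')).decode('ascii')
request.add_header("Authorization", "Basic %s" % credentials)
result = urllib.request.urlopen(request)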

HTTP Basic Authentication Handler

HTTP Basic Authentication Handler: [4] [5] [6]

import urllib2
auth = urllib2.HTTPPasswordMgrWithDefaultRealm()
auth.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib2.HTTPBasicAuthHandler(auth)
url_opener = urllib2.build_opener(auth_handler)
# Note: this will change the default opener.
#    if you do not wish to do this, simply use "url_opener.open(url_request)" everywhere.
urllib2.install_opener(url_opener)
url_request = urllib2.Request('https://api.twitter.com/1/statuses/user_timeline.json?%s' % twitter_args)
url_output = urllib2.urlopen(url_request).read()

# A second approach: build the Authorization header by hand
import urllib2, base64

theurl = 'http://10.10.10.135/image.jpg'
req = urllib2.Request(theurl)
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader =  "Basic %s" % base64string
req.add_header("Authorization", authheader)
resp = urllib2.urlopen(req)
img = resp.read()

# The following has the drawback of requiring the realm to be known in advance:
import urllib2
TRIM_API_URL = 'http://api.tr.im/api'
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='tr.im',
    uri=TRIM_API_URL,
    user=USERNAME,
    passwd=PASSWORD)
url_opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(url_opener)
response = urllib2.urlopen('%s/trim_simple?url=%s'
    % (TRIM_API_URL, url_to_trim))
url = response.read().strip()
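
The handler approach ports to Python 3.x with only the module name changing; a minimal sketch, assuming username, password and a target URL:

import urllib.request

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(auth_handler)
urllib.request.install_opener(opener)  # optional: makes plain urlopen() use this opener
response = urllib.request.urlopen('http://twitter.com/account/')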

urllib Session Tracking

Session Tracking:

import urllib
import urllib2
import random

print "-" * 10, "REQ1", "-" * 10
data = {'username': 'test',
        'password': 'password1',
        'id': str(random.randint(1, 100)),
        }
data = urllib.urlencode(data)
req1 = urllib2.Request('http://demo.oeey.com/session_a.php', data)
req1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ1 HEADERS:", req1.headers.items()
print "REQ1 DATA:", req1.data

print "-" * 10, "RESP1", "-" * 10
resp1 = urllib2.urlopen(req1)
cookie = resp1.headers.get('Set-Cookie')
print 'RESP1 URL:', resp1.geturl()  # new url, if redirected
print 'RESP1 CODE:', resp1.getcode()  # 200
print 'RESP1 COOKIE:', cookie
print 'RESP1 HEADERS:', resp1.headers.items()
print "RESP1 HTML:\n", resp1.read()

print "-" * 10, "REQ2", "-" * 10
data = {
    'product': '3',
}
data = urllib.urlencode(data)
req2 = urllib2.Request('http://demo.oeey.com/session_b.php', data)
req2.add_header('Cookie', cookie)
req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ2 HEADERS:", req2.headers.items()
print "REQ2 DATA:", req2.data

print "-" * 10, "RESP2", "-" * 10
resp2 = urllib2.urlopen(req2)
print 'RESP2 URL:', resp2.geturl()  # new url, if redirected
print 'RESP2 CODE:', resp2.getcode()  # 200
print 'RESP2 HEADERS:', resp2.headers.items()
print "RESP2 HTML:\n", resp2.read()

session_a.php:

<?php
// Start Session
session_start();

// Test redirect - resp1.geturl()
//$_SESSION['id'] = 'redirect';
//header('Location: http://demo.oeey.com/session_b.php');

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// $_REQUEST is combination of $_GET, $_POST and $_COOKIE.
if(isset($_REQUEST['id'])) {
    $_SESSION['id'] = $_REQUEST['id'];
} else {
    $_SESSION['id'] = rand();
}
echo "Request ID: " . $_SESSION['id'];

session_b.php:

<?php
// Start Session
session_start();

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// Show session data
echo "Session Data: \n";
print_r($_SESSION);

// Show ID
echo "ID: \n";
if(isset($_SESSION['id'])) {
    echo $_SESSION['id'];
} else {
    echo "ID not set";
}


simpler example

import urllib
import urllib2

### LOGON
url = 'http://apc.oeey.com/login.tgi'
values = {'Username' : 'admin',
          'Password' : 'admin', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
cookie = response.headers.get('Set-Cookie')
print cookie  # 'DLILPC="W5J/nTupJF0hyrv"; Version=1; Path=/'

### POWER OFF
url = 'http://apc.oeey.com/outlet?8=OFF'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)

### LOGOUT
url = 'http://apc.oeey.com/logout'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)
cookie = response.headers.get('Set-Cookie')
print cookie  # 'DLILPC=""; Version=1; Max-Age=0; Path=/'

URL Cookie Session Tracking

import urllib
import urllib2
import re

url = 'http://apc.oeey.com/Forms/login1'
values = {'login_username' : 'apc',
          'login_password' : 'apc', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
# exceptions:
#   socket.timeout: timed out
#   urllib2.HTTPError: HTTP Error 403: Forbidden

print response.url  # http://apc.oeey.com/NMC/GGcOPeRq8+FWctMifeoezA/home.htm

# get cookie
match = re.findall('http://apc.oeey.com/NMC/(.*)/home.htm', response.url)
print match
cookie = match[0]

# use cookie:
url2 = 'http://apc.oeey.com/NMC/{}/Forms/outlctrl1'.format(cookie)
values2 = { 'rPDUOutletCtrl': '4',
            'OL_Cntrl_Col1_Btn': '?8,2',
            'submit': 'Next >>'}
data2 = urllib.urlencode(values2)
req2 = urllib2.Request(url2, data2)
response2 = urllib2.urlopen(req2)
print response2.url

# logoff:
url6 = 'http://apc.oeey.com/NMC/{}/logout.htm'.format(cookie)
req6 = urllib2.Request(url6)
response6 = urllib2.urlopen(req6)
print response6.url  # http://apc.oeey.com/NMC/X7tmWWC4oYI0Z4hQbnlLaQ/logout.htm
#html = response6.read()