Python/urllib


urllib

Basic Get

import urllib2

response = urllib2.urlopen('http://python.org/')
html = response.read()

# check the HTTP status code
if response.code != 200:
    print "failure"

POST data:

import urllib
import urllib2

url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python' }

data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
the_page = response.read()

print the_page
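
The same POST works in Python 3.x with urllib.parse and urllib.request; a minimal sketch (note the encoded data must be bytes):

import urllib.parse
import urllib.request

url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}

data = urllib.parse.urlencode(values).encode('utf-8')  # bytes, not str
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()

print(the_page)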

Request Headers

...
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
# or
request = urllib2.Request('http://your.tld/...')
request.add_header('User-Agent', 'some fake agent string')
request.add_header('Referer', 'fake referrer')
...
response = urllib2.urlopen(request)

Response Header

print response.info().getheader('Content-Type')
print response.info().headers  # list
response.url  # response url
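
In Python 3.x the response object exposes the same information; a minimal sketch, assuming resp came from urllib.request.urlopen():

print(resp.getheader('Content-Type'))
print(resp.headers.items())  # list of (name, value) pairs
print(resp.url)              # response url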

Cookies

import cookielib
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# use the opener for all subsequent requests
req = urllib2.Request(url, postdata, headers)
response = opener.open(req)
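
cookielib was renamed http.cookiejar in Python 3.x; a minimal sketch of the same pattern:

import http.cookiejar
import urllib.request

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# use the opener for all subsequent requests so cookies are sent back automatically
response = opener.open('http://python.org/')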

Progress Bar

Progress bar: [1]

import sys
import urllib

def dlProgress(count, blockSize, totalSize):
    # rem_file is a global (the remote URL) so the hook can label its output
    percent = int(count * blockSize * 100 / totalSize)
    sys.stdout.write("\r" + rem_file + "...%d%%" % percent)
    sys.stdout.flush()

urllib.urlretrieve(rem_file, loc_file, reporthook=dlProgress)
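
The same reporthook signature works with the legacy urllib.request.urlretrieve() in Python 3.x; a minimal sketch, assuming the same rem_file/loc_file placeholders as above:

import sys
import urllib.request

def dlProgress(count, blockSize, totalSize):
    percent = int(count * blockSize * 100 / totalSize)
    sys.stdout.write("\r...%d%%" % percent)
    sys.stdout.flush()

urllib.request.urlretrieve(rem_file, loc_file, reporthook=dlProgress)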


Read web page 2.x

import urllib
resp = urllib.urlopen('http://www.python.org')
html = resp.read()
head = resp.headers
headers = head.keys()  # or 'for header in head' or 'head.items()'
head['set-cookie']

URL Encode

#!/usr/bin/env python

# urlencode a message from stdin or command parameters
# author: Kenneth Burgener <kenneth@k.ttak.org> (c) 2013

import urllib
import sys
import select

msg = ""
if select.select([sys.stdin,],[],[],0.0)[0]:
    msg = sys.stdin.readline().strip() + " "
if len(sys.argv) > 1:
    msg += " ".join(sys.argv[1:])
msg = msg.strip()
print urllib.quote(msg)
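
In Python 3.x the quoting functions moved to urllib.parse; a minimal sketch:

import urllib.parse
print(urllib.parse.quote("hello world & more"))  # hello%20world%20%26%20more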

cookies

cookies: [2]

import urllib2
req1 = urllib2.Request(url1)
response = urllib2.urlopen(req1)
cookie = response.headers.get('Set-Cookie')
# Use the cookie in subsequent requests
req2 = urllib2.Request(url2)
req2.add_header('cookie', cookie)
response = urllib2.urlopen(req2)

Read web page 3.x

import urllib.request
resp = urllib.request.urlopen('http://www.python.org')
html = resp.read()

Convert byte characters to string

print(html[:100].decode('utf-8'))  # bytes -> str

Download File

Download file: (2.x)

import urllib
urllib.urlretrieve(url, filename)

Download file: (3.x)

import urllib.request
urllib.request.urlretrieve(url, filename)

Open web browser

import webbrowser
webbrowser.open('http://www.google.com')

HTTP Basic Authentication

HTTP Basic Authentication [3]

import urllib2, base64
request = urllib2.Request("http://api.foursquare.com/v1/user")
base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)
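
A rough Python 3.x equivalent, assuming the same username and password variables (base64.b64encode works on bytes):

import urllib.request
import base64

request = urllib.request.Request("http://api.foursquare.com/v1/user")
credentials = base64.b64encode(('%s:%s' % (username, password)).encode('utf-8')).decode('ascii')
request.add_header("Authorization", "Basic %s" % credentials)
result = urllib.request.urlopen(request)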

HTTP Basic Authentication Handler

HTTP Basic Authentication Handler: [4] [5] [6]

import urllib2
auth = urllib2.HTTPPasswordMgrWithDefaultRealm()
auth.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib2.HTTPBasicAuthHandler(auth)
url_opener = urllib2.build_opener(auth_handler)
# Note: this will change the default opener.
#    if you do not wish to do this, simply use "url_opener.open(url_request)" everywhere.
urllib2.install_opener(url_opener)
url_request = urllib2.Request('https://api.twitter.com/1/statuses/user_timeline.json?%s' % twitter_args)
url_output = urllib2.urlopen(url_request).read()

# A second approach: build the Authorization header by hand
import urllib2, base64

theurl = 'http://10.10.10.135/image.jpg'
req = urllib2.Request(theurl)
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader =  "Basic %s" % base64string
req.add_header("Authorization", authheader)
resp = urllib2.urlopen(req)
img = resp.read()

# The following has the drawback of requiring the realm to be known in advance:
import urllib2
TRIM_API_URL = 'http://api.tr.im/api'
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='tr.im',
    uri=TRIM_API_URL,
    user=USERNAME,
    passwd=PASSWORD)
url_opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(url_opener)
response = urllib2.urlopen('%s/trim_simple?url=%s'
    % (TRIM_API_URL, url_to_trim))
url = response.read().strip()
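
The handler approach ports to Python 3.x with only the module name changing; a minimal sketch, assuming username, password and a target URL:

import urllib.request

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(auth_handler)
urllib.request.install_opener(opener)  # optional: makes plain urlopen() use this opener
response = urllib.request.urlopen('http://twitter.com/account/')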

urllib Session Tracking

Session Tracking:

import urllib
import urllib2
import random

print "-" * 10, "REQ1", "-" * 10
data = {'username': 'test',
        'password': 'password1',
        'id': str(random.randint(1, 100)),
        }
data = urllib.urlencode(data)
req1 = urllib2.Request('http://demo.oeey.com/session_a.php', data)
req1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ1 HEADERS:", req1.headers.items()
print "REQ1 DATA:", req1.data

print "-" * 10, "RESP1", "-" * 10
resp1 = urllib2.urlopen(req1)
cookie = resp1.headers.get('Set-Cookie')
print 'RESP1 URL:', resp1.geturl()  # new url, if redirected
print 'RESP1 CODE:', resp1.getcode()  # 200
print 'RESP1 COOKIE:', cookie
print 'RESP1 HEADERS:', resp1.headers.items()
print "RESP1 HTML:\n", resp1.read()

print "-" * 10, "REQ2", "-" * 10
data = {
    'product': '3',
}
data = urllib.urlencode(data)
req2 = urllib2.Request('http://demo.oeey.com/session_b.php', data)
req2.add_header('Cookie', cookie)
req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ2 HEADERS:", req2.headers.items()
print "REQ2 DATA:", req2.data

print "-" * 10, "RESP2", "-" * 10
resp2 = urllib2.urlopen(req2)
print 'RESP2 URL:', resp2.geturl()  # new url, if redirected
print 'RESP2 CODE:', resp2.getcode()  # 200
print 'RESP2 HEADERS:', resp2.headers.items()
print "RESP2 HTML:\n", resp2.read()

session_a.php:

<?php
// Start Session
session_start();

// Test redirect - resp1.geturl()
//$_SESSION['id'] = 'redirect';
//header('Location: http://demo.oeey.com/session_b.php');

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// $_REQUEST is combination of $_GET, $_POST and $_COOKIE.
if(isset($_REQUEST['id'])) {
    $_SESSION['id'] = $_REQUEST['id'];
} else {
    $_SESSION['id'] = rand();
}
echo "Request ID: " . $_SESSION['id'];

session_b.php:

<?php
// Start Session
session_start();

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// Show session data
echo "Session Data: \n";
print_r($_SESSION);

// Show ID
echo "ID: \n";
if(isset($_SESSION['id'])) {
    echo $_SESSION['id'];
} else {
    echo "ID not set";
}


simpler example

import urllib
import urllib2

### LOGON
url = 'http://apc.oeey.com/login.tgi'
values = {'Username' : 'admin',
          'Password' : 'admin', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
cookie = response.headers.get('Set-Cookie')
print cookie  # 'DLILPC="W5J/nTupJF0hyrv"; Version=1; Path=/'

### POWER OFF
url = 'http://apc.oeey.com/outlet?8=OFF'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)

### LOGOUT
url = 'http://apc.oeey.com/logout'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)
cookie = response.headers.get('Set-Cookie')
print cookie  # 'DLILPC=""; Version=1; Max-Age=0; Path=/'

URL Cookie Session Tracking

import urllib
import urllib2
import re

url = 'http://apc.oeey.com/Forms/login1'
values = {'login_username' : 'apc',
          'login_password' : 'apc', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
# exceptions:
#   socket.timeout: timed out
#   urllib2.HTTPError: HTTP Error 403: Forbidden

print response.url  # http://apc.oeey.com/NMC/GGcOPeRq8+FWctMifeoezA/home.htm

# get cookie
match = re.findall('http://apc.oeey.com/NMC/(.*)/home.htm', response.url)
print match
cookie = match[0]

# use cookie:
url2 = 'http://apc.oeey.com/NMC/{}/Forms/outlctrl1'.format(cookie)
values2 = { 'rPDUOutletCtrl': '4',
            'OL_Cntrl_Col1_Btn': '?8,2',
            'submit': 'Next >>'}
data2 = urllib.urlencode(values2)
req2 = urllib2.Request(url2, data2)
response2 = urllib2.urlopen(req2)
print response2.url

# logoff:
url6 = 'http://apc.oeey.com/NMC/{}/logout.htm'.format(cookie)
req6 = urllib2.Request(url6)
response6 = urllib2.urlopen(req6)
print response6.url  # http://apc.oeey.com/NMC/X7tmWWC4oYI0Z4hQbnlLaQ/logout.htm
#html = response6.read()