Python/urllib

From Omnia
Jump to: navigation, search

urllib

Basic Get

import urllib2
response = urllib2.urlopen('http://python.org/')
html = response.read()

POST data:

import urllib
import urllib2

url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python' }

data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
the_page = response.read()

print the_page

Request Headers

...
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
# or
request = urllib2.Request('http://your.tld/...')
request.add_header('User-Agent', 'some fake agent string')
request.add_header('Referer', 'fake referrer')
...
response = urllib2.urlopen(request)

Response Header

print response.info().getheader('Content-Type')
print response.info().headers  # list
response.url  # response url

Cookies

import cookielib
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# use opener for all future requests
req = urllib2.Request(url, postdata, headers)
response = opener.open(req)

References:

Progress Bar

Progress bar: [1]

# dlProgress reads rem_file as a module-level name when it prints progress.
# NOTE(review): a `global` statement at module level has no effect; rem_file
# and loc_file still have to be assigned before the call below.
global rem_file # global variable to be used in dlProgress

# NOTE: dlProgress must already be defined when this line executes, so in a
# runnable script the `def dlProgress` block has to come before this call.
# The reporthook is called with (block count, block size, total size).
urllib.urlretrieve(rem_file, loc_file, reporthook=dlProgress)

def dlProgress(count, blockSize, totalSize):
    """Report download progress on a single line of stdout.

    Intended as the ``reporthook`` callback of ``urlretrieve``. The output is
    labelled with the module-level name ``rem_file``, which must be set
    before the download starts.

    Args:
        count: number of blocks transferred so far.
        blockSize: size of one block, in bytes.
        totalSize: total size of the download in bytes; zero or negative
            when the size is not known.
    """
    downloaded = count * blockSize
    if totalSize > 0:
        # The last block is usually only partly filled, so count * blockSize
        # can overshoot the real size; cap the figure at 100%.
        percent = min(100, downloaded * 100 // totalSize)
        status = "...%d%%" % percent
    else:
        # Unknown size: a percentage would divide by zero or come out
        # negative, so show the number of bytes received instead.
        status = "...%d bytes" % downloaded
    # "\r" returns to the start of the line so each update overwrites the last.
    sys.stdout.write("\r" + rem_file + status)
    sys.stdout.flush()


Read web page 2.x

import urllib
resp = urllib.urlopen('http://www.python.org')
html = resp.read()
head = resp.headers
headers = head.keys()  # or 'for header in head' or 'head.items()'
head['set-cookie']

URL Encode

#!/usr/bin/env python

# urlencode a message from stdin or command parameters
# author: Kenneth Burgener <kenneth@k.ttak.org> (c) 2013

import urllib
import sys
import select

# Build the message from the first line of piped stdin (only when data is
# already waiting) followed by the command-line arguments, then print it
# URL-quoted.
pieces = []
# Zero timeout: just poll stdin, never block waiting for input.
readable, _, _ = select.select([sys.stdin], [], [], 0.0)
if readable:
    pieces.append(sys.stdin.readline().strip() + " ")
# With no arguments this contributes an empty string.
pieces.append(" ".join(sys.argv[1:]))
msg = "".join(pieces).strip()
print(urllib.quote(msg))

cookies

cookies: [2]

import urllib2
req1 = urllib2.Request(url1)
response = urllib2.urlopen(req1)
cookie = response.headers.get('Set-Cookie')
# Use the cookie in subsequent requests
req2 = urllib2.Request(url2)
req2.add_header('cookie', cookie)
response = urllib2.urlopen(req2)

Read web page 3.x

import urllib.request
resp = urllib.request.urlopen('http://www.python.org')
html = resp.read()

Convert byte characters to string

print(f.read(100).decode('utf-8'))

Download File

Download file: (2.x)

import urllib
urllib.urlretrieve(url, filename)

Download file: (3.x)

import urllib.request
urllib.request.urlretrieve(url, filename)

Open web browser

import webbrowser
webbrowser.open('http://www.google.com')

HTTP Basic Authentication

HTTP Basic Authentication [3]

# Send HTTP Basic credentials up front in the Authorization header.
# username and password must be defined before this snippet runs.
import urllib2
import base64
request = urllib2.Request("http://api.foursquare.com/v1/user")
# b64encode never inserts line breaks, so its result can be used as a header
# value directly (encodestring wraps its output and appends a newline).
base64string = base64.b64encode('%s:%s' % (username, password))
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)

HTTP Basic Authentication Handler

HTTP Basic Authentication Handler: [4] [5] [6]

# Let urllib2 answer HTTP Basic challenges through a password manager.
# username, password and twitter_args must be defined before this runs.
import urllib2
auth = urllib2.HTTPPasswordMgrWithDefaultRealm()
# Realm is given as None, so no realm name has to be known in advance.
# NOTE(review): the credentials are registered for http://twitter.com/account/
# but the request below goes to https://api.twitter.com/ -- confirm the
# handler will actually apply them to that host.
auth.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib2.HTTPBasicAuthHandler(auth)
url_opener = urllib2.build_opener(auth_handler)
# Note: this will change the default opener.
#    if you do not wish to do this, simply use "url_opener.open(url_request)" everywhere.
urllib2.install_opener(url_opener)
# twitter_args is substituted into the query string as-is.
url_request = urllib2.Request('https://api.twitter.com/1/statuses/user_timeline.json?%s' % twitter_args)
url_output = urllib2.urlopen(url_request).read()
# Alternative: build the Authorization header by hand and fetch an image.
# username and password must be defined before this snippet runs.
import base64
theurl = 'http://10.10.10.135/image.jpg'
req = urllib2.Request(theurl)
# b64encode emits no line breaks. encodestring(...)[:-1] only removed the
# final newline and left embedded ones in place for long credentials.
base64string = base64.b64encode('%s:%s' % (username, password))
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)
resp = urllib2.urlopen(req)
img = resp.read()
# the following has the problem of being required to already know the realm
# (add_password is given the explicit realm string 'tr.im' here).
# USERNAME, PASSWORD and url_to_trim must be defined before this runs.
import urllib2
TRIM_API_URL = 'http://api.tr.im/api'
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='tr.im',
    uri=TRIM_API_URL,
    user=USERNAME,
    passwd=PASSWORD)
url_opener = urllib2.build_opener(auth_handler)
# Install the opener so the plain urllib2.urlopen() call below goes through it.
urllib2.install_opener(url_opener)
# NOTE(review): url_to_trim is placed in the query string as-is; confirm it
# has already been URL-encoded by the time it gets here.
response = urllib2.urlopen('%s/trim_simple?url=%s'
    % (TRIM_API_URL, url_to_trim))
# The response body, stripped of surrounding whitespace, is stored in url.
url = response.read().strip()

urllib Session Tracking

Session Tracking:

import urllib
import urllib2
import random

# Two-request demo: request 1 posts form fields to session_a.php and captures
# the Set-Cookie header; request 2 sends that cookie back to session_b.php.

# --- Request 1: build and show the POST ---
print "-" * 10, "REQ1", "-" * 10
data = {'username': 'test',
        'password': 'password1',
        'id': str(random.randint(1, 100)),
        }
data = urllib.urlencode(data)
# Passing the encoded data as the second argument sends it as the request body.
req1 = urllib2.Request('http://demo.oeey.com/session_a.php', data)
req1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ1 HEADERS:", req1.headers.items()
print "REQ1 DATA:", req1.data

# --- Response 1: send the request and keep the cookie ---
print "-" * 10, "RESP1", "-" * 10
resp1 = urllib2.urlopen(req1)
# Keep the raw Set-Cookie value so it can be replayed on the second request.
cookie = resp1.headers.get('Set-Cookie')
print 'RESP1 URL:', resp1.geturl()  # new url, if redirected
print 'RESP1 CODE:', resp1.getcode()  # 200
print 'RESP1 COOKIE:', cookie
print 'RESP1 HEADERS:', resp1.headers.items()
print "RESP1 HTML:\n", resp1.read()

# --- Request 2: different form data, same cookie ---
print "-" * 10, "REQ2", "-" * 10
data = {
    'product': '3',
        }
data = urllib.urlencode(data)
req2 = urllib2.Request('http://demo.oeey.com/session_b.php', data)
# Send back the cookie captured from response 1.
req2.add_header('Cookie', cookie)
req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ2 HEADERS:", req2.headers.items()
print "REQ2 DATA:", req2.data

# --- Response 2: show what session_b.php returned ---
print "-" * 10, "RESP2", "-" * 10
resp2 = urllib2.urlopen(req2)
print 'RESP2 URL:', resp2.geturl()  # new url, if redirected
print 'RESP2 CODE:', resp2.getcode()  # 200
print 'RESP2 HEADERS:', resp2.headers.items()
print "RESP2 HTML:\n", resp2.read()

session_a.php:

<?php
// Start Session
session_start();

// Test redirect - resp1.geturl()
//$_SESSION['id'] = 'redirect';
//header('Location: http://demo.oeey.com/session_b.php');

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// $_REQUEST is combination of $_GET, $_POST and $_COOKIE.
if(isset($_REQUEST['id'])) {
    $_SESSION['id'] = $_REQUEST['id'];
} else {
    $_SESSION['id'] = rand();
}
echo "Request ID: " . $_SESSION['id'];

session_b.php:

<?php
// Start Session
session_start();

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// Show session data
echo "Session Data: \n";
print_r($_SESSION);

// Show ID
echo "ID: \n";
if(isset($_SESSION['id'])) {
    echo $_SESSION['id'];
} else {
    echo "ID not set";
}


simpler example

# Log on, switch an outlet off, then log out, carrying the session cookie by hand.
# Assumes `import urllib` and `import urllib2` have already been executed.

### LOGON
url = 'http://apc.oeey.com/login.tgi'
values = {'Username' : 'admin',
          'Password' : 'admin', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
# Each request in this example gives up after 3 seconds.
response = urllib2.urlopen(req, timeout=3)
# The raw Set-Cookie value is reused as the Cookie header on later requests.
cookie = response.headers.get('Set-Cookie')
print cookie  # 'DLILPC="W5J/nTupJF0hyrv"; Version=1; Path=/'

### POWER OFF
url = 'http://apc.oeey.com/outlet?8=OFF'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)

### LOGOUT
url = 'http://apc.oeey.com/logout'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)
# cookie is overwritten with the Set-Cookie value from the logout response.
cookie = response.headers.get('Set-Cookie')
print cookie  # 'DLILPC=""; Version=1; Max-Age=0; Path=/'

URL Cookie Session Tracking

import urllib
import urllib2
import re

url = 'http://apc.oeey.com/Forms/login1'
values = {'login_username' : 'apc',
          'login_password' : 'apc', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
# exceptions:
#   socket.timeout: timed out
#   urllib2.HTTPError: HTTP Error 403: Forbidden

print response.url  # http://apc.oeey.com/NMC/GGcOPeRq8+FWctMifeoezA/home.htm

# get cookie
match = re.findall('http://apc.oeey.com/NMC/(.*)/home.htm', response.url)
print match
cookie = match[0]

# use cookie:
url2 = 'http://apc.oeey.com/NMC/{}/Forms/outlctrl1'.format(cookie)
values2 = { 'rPDUOutletCtrl': '4',
            'OL_Cntrl_Col1_Btn': '?8,2',
            'submit': 'Next >>'}
data2 = urllib.urlencode(values2)
req2 = urllib2.Request(url2, data2)
response2 = urllib2.urlopen(req2)
print response2.url

# logoff:
url6 = 'http://apc.oeey.com/NMC/{}/logout.htm'.format(cookie)
req6 = urllib2.Request(url6)
response6 = urllib2.urlopen(req6)
print response6.url  # http://apc.oeey.com/NMC/X7tmWWC4oYI0Z4hQbnlLaQ/logout.htm
#html = response6.read()