-
Notifications
You must be signed in to change notification settings - Fork 38
/
httpbot.py
174 lines (153 loc) · 6.54 KB
/
httpbot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Rodrigo Silva (MestreLion) <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. See <http://www.gnu.org/licenses/gpl.html>
#
# urllib2 wrapper for a simpler, higher-level API
import os
import sys
import urllib
import logging
import hashlib
if sys.version_info.major < 3:
import urllib2 # @UnresolvedImport
import urlparse # @UnresolvedImport
HTTPError = urllib2.HTTPError
else:
import urllib.request as urllib2
import urllib.parse as urlparse
urllib.unquote = urlparse.unquote
HTTPError = urllib.error.HTTPError
from lxml import html # Debian/Ubuntu: python-lxml
try:
import progressbar # Debian/Ubuntu: python-progressbar
except ImportError:
progressbar = None
log = logging.getLogger(__name__)
class HttpBot(object):
""" Base class for other handling basic http tasks like requesting a page,
download a file and cache content. Not to be used directly
"""
def __init__(self, base_url="", tag="", cookiejar=None, debug=False):
self.tag = tag
hh = urllib2.HTTPHandler( debuglevel=1 if debug else 0)
hsh = urllib2.HTTPSHandler(debuglevel=1 if debug else 0)
cp = urllib2.HTTPCookieProcessor(cookiejar)
self._opener = urllib2.build_opener(hh, hsh, cp)
scheme, netloc, path, q, f = urlparse.urlsplit(base_url, "http")
if not netloc:
netloc, _, path = path.partition('/')
self.base_url = urlparse.urlunsplit((scheme, netloc, path, q, f))
def get(self, url, postdata=None):
""" Send an HTTP request, either GET (if no postdata) or POST
Keeps session and other cookies.
postdata is a dict with name/value pairs
url can be absolute or relative to base_url
"""
url = urlparse.urljoin(self.base_url, url)
if postdata:
return self._opener.open(url, urllib.urlencode(postdata))
else:
return self._opener.open(url)
def download(self, url, path=None, md5sum=None, expected_size=0,
progress=True, keep_partial=False, chunk_size=0):
show = progress and progressbar
chunk_size = chunk_size or 32*1024 # 32K is arbitrary
download = self.get(url)
# Set download path
# If save name is not set, use the downloaded file name
# "Not set" means either path is an existing dir or ends with a trailing '/'
path = os.path.expanduser(path or ".")
if os.path.isdir(path) or not os.path.basename(path):
#TODO: Parse Content-Disposition header for filename
basename = urllib.unquote(os.path.basename
(urlparse.urlsplit(download.geturl()).path))
path = os.path.join(path, basename)
log.info("Downloading to %s", path)
# Handle dir
dirname, _ = os.path.split(path)
try:
log.debug("Creating destination directory %s", dirname)
os.makedirs(dirname)
except OSError as e:
# Ignore if destination directory exists, raise otherwise
if not (e.errno == 17 and os.path.isdir(dirname)):
raise
size = expected_size or int(download.info().get('Content-Length', 0))
if md5sum and os.path.isfile(path) and os.path.getsize(path) == size:
log.debug("File already exists, checking its MD5")
if filehash(path, hashlib.md5()) == md5sum:
log.debug("MD5 matches, skipping download and using cached file")
return path
else:
log.debug("MD5 does not match, downloading")
if show:
pbar = progressbar.ProgressBar(widgets=[
' ', progressbar.Percentage(), ' of %.1f MiB' % (size/1024.0**2),
' ', progressbar.Bar('.'),
' ', progressbar.FileTransferSpeed(),
' ', progressbar.ETA(),
' '], maxval=size).start()
# TODO: Perhaps overkill, but network read and md5/sha1/disk write could be done async
# Could save around 30s for a 1-GiB file
completed = False
try:
with open(path, 'wb') as f:
for data in iter(lambda: download.read(chunk_size), b''):
f.write(data)
if show:
pbar.update(min([size, pbar.value + chunk_size]))
completed = True
except KeyboardInterrupt:
pass
finally:
if show:
pbar.finish()
if not completed:
log.warning("Download aborted")
if not keep_partial:
log.debug("Removing partial file")
os.remove(path)
if completed:
if not md5sum:
return path
realhash = filehash(path, hashlib.md5())
if md5sum == realhash:
log.debug("Download MD5 match: %s", md5sum)
return path
else:
log.warning("Download MD5 does not match - file is likely corrupt.")
log.debug("Expected and downloaded MD5:\n%s\n%s", md5sum, realhash)
def quote(self, text):
""" Quote a text for URL usage, similar to urllib.quote_plus.
Handles unicode and also encodes "/"
"""
if isinstance(text, unicode):
text = text.encode('utf-8')
return urllib.quote_plus(text, safe=b'')
def parse(self, url, postdata=None):
""" Parse an URL and return an etree ElementRoot.
Assumes UTF-8 encoding
"""
return html.parse(self.get(url, postdata),
parser=html.HTMLParser(encoding='utf-8'))
def filehash(path, hashobj=None, chunk_size=0):
hashobj = hashobj or hashlib.md5()
chunk = chunk_size or 32*1024
with open(path, 'rb') as f:
for data in iter(lambda: f.read(chunk), b''):
hashobj.update(data)
return hashobj.hexdigest()