-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathremotezip.py
315 lines (277 loc) · 10.3 KB
/
remotezip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
# coding: utf-8
"""
Read remote ZIP files using HTTP range requests
@author: João Pinto (http://stackoverflow.com/users/401041/joao-pinto)
@author: Modified to work with Python 2.6 by Jeff Jacobson.
@see: http://stackoverflow.com/a/7843535
"""
import struct
import urllib2
# import zlib
import cStringIO
from zipfile import ZipExtFile, ZipInfo, BadZipfile
from os.path import join, basename, exists, isdir
from os import mkdir
# The code is mostly adatpted from the zipfile module
# NOTE: ZIP64 is not supported
# The "end of central directory" structure, magic number, size, and indices
# (section V.I in the format document)
structEndArchive = "<4s4H2LH"
stringEndArchive = "PK\005\006"
sizeEndCentDir = struct.calcsize(structEndArchive)
_ECD_SIGNATURE = 0
_ECD_DISK_NUMBER = 1
_ECD_DISK_START = 2
_ECD_ENTRIES_THIS_DISK = 3
_ECD_ENTRIES_TOTAL = 4
_ECD_SIZE = 5
_ECD_OFFSET = 6
_ECD_COMMENT_SIZE = 7
# These last two indices are not part of the structure as defined in the
# spec, but they are used internally by this module as a convenience
_ECD_COMMENT = 8
_ECD_LOCATION = 9
# The "central directory" structure, magic number, size, and indices
# of entries in the structure (section V.F in the format document)
structCentralDir = "<4s4B4HL2L5H2L"
stringCentralDir = "PK\001\002"
sizeCentralDir = struct.calcsize(structCentralDir)
# indexes of entries in the central directory structure
_CD_SIGNATURE = 0
_CD_CREATE_VERSION = 1
_CD_CREATE_SYSTEM = 2
_CD_EXTRACT_VERSION = 3
_CD_EXTRACT_SYSTEM = 4
_CD_FLAG_BITS = 5
_CD_COMPRESS_TYPE = 6
_CD_TIME = 7
_CD_DATE = 8
_CD_CRC = 9
_CD_COMPRESSED_SIZE = 10
_CD_UNCOMPRESSED_SIZE = 11
_CD_FILENAME_LENGTH = 12
_CD_EXTRA_FIELD_LENGTH = 13
_CD_COMMENT_LENGTH = 14
_CD_DISK_NUMBER_START = 15
_CD_INTERNAL_FILE_ATTRIBUTES = 16
_CD_EXTERNAL_FILE_ATTRIBUTES = 17
_CD_LOCAL_HEADER_OFFSET = 18
# The "local file header" structure, magic number, size, and indices
# (section V.A in the format document)
structFileHeader = "<4s2B4HL2L2H"
stringFileHeader = "PK\003\004"
sizeFileHeader = struct.calcsize(structFileHeader)
_FH_SIGNATURE = 0
_FH_EXTRACT_VERSION = 1
_FH_EXTRACT_SYSTEM = 2
_FH_GENERAL_PURPOSE_FLAG_BITS = 3
_FH_COMPRESSION_METHOD = 4
_FH_LAST_MOD_TIME = 5
_FH_LAST_MOD_DATE = 6
_FH_CRC = 7
_FH_COMPRESSED_SIZE = 8
_FH_UNCOMPRESSED_SIZE = 9
_FH_FILENAME_LENGTH = 10
_FH_EXTRA_FIELD_LENGTH = 11
def _http_get_partial_data(url, start_range, end_range=None):
req = urllib2.Request(url)
range_header = "bytes=%s" % start_range
if end_range is not None:
range_header += "-%s" % end_range
req.headers['Range'] = range_header
f = urllib2.urlopen(req)
return f
def _EndRecData(url):
"""Return data from the "End of Central Directory" record, or None.
The data is a list of the nine items in the ZIP "End of central dir"
record followed by a tenth item, the file seek offset of this record."""
ECD = _http_get_partial_data(url, -sizeEndCentDir)
content_range = ECD.headers.get('Content-Range')
filesize = int(content_range.split('/')[1]) if content_range and '/' in content_range else 0
data = ECD.read()
ECD.close()
if data[0:4] == stringEndArchive and data[-2:] == "\000\000":
# the signature is correct and there's no comment, unpack structure
endrec = struct.unpack(structEndArchive, data)
endrec = list(endrec)
# Append a blank comment and record start offset
endrec.append("")
endrec.append(filesize - sizeEndCentDir)
return endrec
# Either this is not a ZIP file, or it is a ZIP file with an archive
# comment. Search the end of the file for the "end of central directory"
# record signature. The comment is the last item in the ZIP file and may be
# up to 64K long. It is assumed that the "end of central directory" magic
# number does not appear in the comment.
# Search by retrieving chunks of 256, 1k and 64k
try_ranges = (1 << 8, 1 << 10, 1 << 16)
for check_range in try_ranges:
ECD = _http_get_partial_data(url, -(check_range + sizeEndCentDir))
data = ECD.read()
content_range = ECD.headers.get('Content-Range')
ECD.close()
download_start = content_range.split('-')[0]
start = data.rfind(stringEndArchive)
if start >= 0:
# found the magic number; attempt to unpack and interpret
recData = data[start:start+sizeEndCentDir]
endrec = list(struct.unpack(structEndArchive, recData))
commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file
comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
endrec.append(comment)
endrec.append(download_start + start)
return endrec
raise IOError
class HTTPZipFile:
def __init__(self, url):
self.url = url
self.NameToInfo = {} # Find file info given name
self.filelist = [] # List of ZipInfo instances for archive
self.pwd = None
self.comment = ''
self.debug = 0
self._RealGetContents()
def _RealGetContents(self):
"""Read in the table of contents for the ZIP file."""
try:
endrec = _EndRecData(self.url)
except IOError:
raise BadZipfile("File is not a zip file")
if not endrec:
raise BadZipfile, "File is not a zip file"
if self.debug > 1:
print endrec
size_cd = endrec[_ECD_SIZE] # bytes in central directory
offset_cd = endrec[_ECD_OFFSET] # offset of central directory
self.comment = endrec[_ECD_COMMENT] # archive comment
# "concat" is zero, unless zip was concatenated to another file
concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
#if endrec[_ECD_SIGNATURE] == stringEndArchive64:
# # If Zip64 extension structures are present, account for them
# concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
if self.debug > 2:
inferred = concat + offset_cd
print "given, inferred, offset", offset_cd, inferred, concat
# self.start_dir: Position of start of central directory
self.start_dir = offset_cd + concat
ECD = _http_get_partial_data(self.url, self.start_dir, self.start_dir+size_cd-1)
data = ECD.read()
ECD.close()
fp = cStringIO.StringIO(data)
total = 0
while total < size_cd:
centdir = fp.read(sizeCentralDir)
if centdir[0:4] != stringCentralDir:
raise BadZipfile, "Bad magic number for central directory"
centdir = struct.unpack(structCentralDir, centdir)
if self.debug > 2:
print centdir
filename = fp.read(centdir[_CD_FILENAME_LENGTH])
# Create ZipInfo instance to store file information
x = ZipInfo(filename)
x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
(x.create_version, x.create_system, x.extract_version, x.reserved,
x.flag_bits, x.compress_type, t, d,
x.CRC, x.compress_size, x.file_size) = centdir[1:12]
x.volume, x.internal_attr, x.external_attr = centdir[15:18]
# Convert date/time code to (year, month, day, hour, min, sec)
x._raw_time = t
x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
x._decodeExtra()
x.header_offset = x.header_offset + concat
x.filename = x._decodeFilename()
self.filelist.append(x)
self.NameToInfo[x.filename] = x
# update total bytes read from central directory
total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
+ centdir[_CD_EXTRA_FIELD_LENGTH]
+ centdir[_CD_COMMENT_LENGTH])
if self.debug > 2:
print "total", total
def namelist(self):
"""Return a list of file names in the archive."""
l = []
for data in self.filelist:
l.append(data.filename)
return l
def infolist(self):
"""Return a list of class ZipInfo instances for files in the
archive."""
return self.filelist
def printdir(self):
"""Print a table of contents for the zip file."""
print "%-46s %19s %12s" % ("File Name", "Modified ", "Size")
for zinfo in self.filelist:
date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size)
def getinfo(self, name):
"""Return the instance of ZipInfo given 'name'."""
info = self.NameToInfo.get(name)
if info is None:
raise KeyError(
'There is no item named %r in the archive' % name)
return info
def open(self, name, pwd=None):
"""Return file-like object for 'name'."""
if not self.url:
raise RuntimeError, \
"Attempt to read ZIP archive that was already closed"
zinfo = self.getinfo(name)
offset = zinfo.header_offset
f = _http_get_partial_data(self.url, offset, offset+sizeFileHeader-1)
fheader = f.read()
f.close()
fheader = struct.unpack(structFileHeader, fheader)
offset += sizeFileHeader
f = _http_get_partial_data(self.url, offset, offset+fheader[_FH_FILENAME_LENGTH]-1)
fname = f.read()
f.close()
if fname != zinfo.orig_filename:
raise BadZipfile, \
'File name in directory "%s" and header "%s" differ.' % (
zinfo.orig_filename, fname)
is_encrypted = zinfo.flag_bits & 0x1
if is_encrypted:
raise RuntimeError, "File %s is encrypted, " \
"not supported." % name
offset += fheader[_FH_FILENAME_LENGTH]+fheader[_FH_EXTRA_FIELD_LENGTH]
f = _http_get_partial_data(self.url, offset, offset+fheader[_FH_COMPRESSED_SIZE]-1)
data = f.read()
return ZipExtFile(cStringIO.StringIO(data), 'r', zinfo)
#return ZipExtFile(cStringIO.StringIO(data), zinfo)
if __name__ == "__main__":
# Some tests
destDir = "tmp"
# Create the destination directory if it does not already exist.
if not exists(destDir):
mkdir(destDir)
elif not isdir(destDir):
raise "Destination directory path exists, but is not a directory."
# link="http://dfn.dl.sourceforge.net/project/filezilla/FileZilla_Client/3.5.1/FileZilla_3.5.1_win32.zip"
link = "http://tod.faa.gov/tod/public/DOFS/DOF_120304.zip"
hzfile = HTTPZipFile(link)
hzfile.printdir()
#for fname in ('FileZilla-3.5.1/GPL.html', 'FileZilla-3.5.1/resources/blukis/48x48/filter.png', 'FileZilla-3.5.1/resources/finished.wav'):
for fname in (('53-WA.Dat',)):
source_name = fname
dest_fname = join(destDir, basename(fname))
print "Extracing %s to %s" % (source_name, dest_fname)
with hzfile.open(source_name) as f:
data = f.read()
new_file = open(dest_fname, 'wb')
new_file.write(data)
new_file.close()
##f = hzfile.open(source_name)
##new_file = None
##try:
## data = f.read()
## new_file = open(dest_fname, 'wb') # must open file as binary (unless you know you're only dealing with text files).
## new_file.write(data)
## new_file.close()
##finally:
## f.close()
## if new_file is not None:
## new_file.close()