-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpackage_statistic.py
executable file
·124 lines (106 loc) · 4.63 KB
/
package_statistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
import os
import sys
import urllib.request
import gzip
from collections import defaultdict
from heapq import nlargest
# Local directory in which the Contents index files are cached
contents_dir = './contents'

# Base URL of the Debian mirror the Contents index files are fetched from
mirror_url = "http://ftp.uk.debian.org/debian/dists/stable/main/"

# Names of the architectures accepted on the command line
architectures_set = {
    'amd64', 'arm64', 'armel', 'armhf', 'i386',
    'mips64el', 'mipsel', 'ppc64el', 's390x',
}
class Package_Frequency:
    '''
    Pair a package name with the number of files associated with it.

    Instances are ordered so that the "largest" one is the package with the
    most files; among packages with the same number of files, the one whose
    name sorts first is the largest. Only "<" is defined, which is what the
    heap and sort helpers use to compare elements.
    '''

    def __init__(self, pkg_name, freq):
        self.pkg_name = pkg_name  # str, the package name
        self.freq = freq          # int, the number of associated files

    def __repr__(self):
        return "Package_Frequency(pkg_name=%r, freq=%r)" % (self.pkg_name, self.freq)

    def __lt__(self, other):
        # Let Python try the reflected operation / raise TypeError itself
        # instead of failing with AttributeError on a foreign operand.
        if not isinstance(other, Package_Frequency):
            return NotImplemented
        # The number of files is compared first; the names are swapped in the
        # tuples so that, on a tie, the greater name is the smaller element.
        return (self.freq, other.pkg_name) < (other.freq, self.pkg_name)
def download_content(architecture):
    '''
    Download and decompress the Contents index file of one architecture.

    An index that already exists in contents_dir is reused without any
    network access. Otherwise the gzip file is fetched from mirror_url and
    decompressed next to it; the gzip file is deleted afterwards.
    param architecture: str, the name of the architecture
    return:
        str, the local path of the decompressed Contents index file
        None, if the architecture is not valid or the download fails
    Errors raised while decompressing are propagated after the partial
    files have been removed.
    '''
    if architecture not in architectures_set:
        print("Error: %s is not a valid architecture." % architecture)
        print("Valid architectures: ", list(architectures_set))
        return None

    # the name of the Contents index file
    content_name = 'Contents-%s' % (architecture)
    # the local path of the decompressed file
    content_path = os.path.join(contents_dir, content_name)
    if os.path.exists(content_path):  # already downloaded
        return content_path

    # Do not rely on the caller having created the cache directory.
    os.makedirs(contents_dir, exist_ok=True)

    gz_path = content_path + '.gz'
    part_path = content_path + '.part'
    try:
        urllib.request.urlretrieve(mirror_url + content_name + '.gz', gz_path)
    except Exception as e:
        print(e)
        print("Error: There is no Content for the %s architecture." % architecture)
        # An interrupted transfer may have left a truncated gzip file behind.
        if os.path.exists(gz_path):
            os.remove(gz_path)
        return None

    try:
        # Decompress in chunks into a temporary file so that the whole index
        # is never held in memory and a failure cannot leave a truncated
        # file that a later run would mistake for a valid cached index.
        with gzip.open(gz_path, 'rb') as src, open(part_path, 'wb') as dst:
            for chunk in iter(lambda: src.read(1024 * 1024), b''):
                dst.write(chunk)
        os.replace(part_path, content_path)
    finally:
        # The gzip file is always deleted; the temporary file only still
        # exists here when decompression failed.
        for leftover in (gz_path, part_path):
            if os.path.exists(leftover):
                os.remove(leftover)
    return content_path
def deprecated_package_name(pkg_name):
    '''
    Tell whether a package entry should be left out of the statistics.
    param pkg_name: str, one entry of the package column of a Contents line
    return: bool, True for the "LOCATION" entry and for entries containing
        at least two '/' (the deprecated form including $AREA)
    '''
    if pkg_name == "LOCATION":
        return True
    # two or more separators <=> more than two '/'-separated components
    return pkg_name.count('/') >= 2
def package_counter(content_path):
    '''
    Count the number of associated files of each package in the Contents index file.

    The last whitespace-separated column of every line is read as a
    comma-separated list of package names. Blank lines and names rejected by
    deprecated_package_name() are skipped.
    param content_path: str, the local path of the Contents index file
    return: dict, the hashmap of (package name, the number of associated files) pair
    Prints an error and exits with status 1 if the file cannot be opened or read.
    '''
    pkg_freq = defaultdict(int)
    try:
        # Decode explicitly instead of relying on the locale encoding; only
        # the last column is used, so an undecodable byte in a file name is
        # replaced rather than aborting the whole count.
        with open(content_path, encoding='utf-8', errors='replace') as pkg_file:
            for line in pkg_file:
                # Only the last column is needed, so split once from the right.
                columns = line.rsplit(None, 1)
                if not columns:  # blank line
                    continue
                # a file may belong to several packages
                for pkg in columns[-1].split(','):
                    if deprecated_package_name(pkg):
                        continue
                    pkg_freq[pkg] += 1
    except (OSError, TypeError, ValueError) as e:
        # what open()/read raise for a missing, unreadable or malformed path
        print(e)
        print("Error: The Content path %s is invalid." % content_path)
        # sys.exit() instead of the site-provided exit(), with a failure status
        sys.exit(1)
    return pkg_freq
def print_top_package(pkg_freq, size=10):
    '''
    Print out the top <size> packages that have the most files associated with them.

    Packages with the same number of files are listed in ascending name
    order. Nothing is printed when there is no package to show (empty
    pkg_freq, or size <= 0).
    param pkg_freq: dict, the hashmap of the number of associated files of each package
    param size: int, the number of the top packages to be printed
    '''
    # Use a heap to get the top <size> packages, largest first
    ans = nlargest(size, (Package_Frequency(pkg_name, freq)
                          for pkg_name, freq in pkg_freq.items()))
    if not ans:  # max() below would raise ValueError on an empty sequence
        return

    # width of the name column, for output alignment
    name_width = max(len(pkg.pkg_name) for pkg in ans) + 1
    print("%s\t%s " % (" package name".ljust(name_width), "number of files"))
    for rank, pkg in enumerate(ans, start=1):
        print("%i. %s\t%s " % (rank, pkg.pkg_name.ljust(name_width), pkg.freq))
if __name__ == '__main__':
    # Usage: package_statistic.py <architecture>
    # sys.exit() is used instead of exit(): the latter is injected by the
    # site module and is not available in every interpreter setup.
    if len(sys.argv) < 2:
        print("Error: Missing architecture!")
        print("Usage: %s <architecture>" % (sys.argv[0]))
        sys.exit(1)
    architecture = sys.argv[1]  # get the architecture argument

    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(contents_dir, exist_ok=True)

    content_path = download_content(architecture)
    if not content_path:  # invalid architecture or download failure
        sys.exit(1)

    pkg_freq = package_counter(content_path)
    if not pkg_freq:  # no package found, nothing to print
        sys.exit(0)

    print_top_package(pkg_freq)  # print out the result