util.py
import sqlite3
import json
from time import time
from multiprocessing import Process
from tld import get_tld
import ipaddress
from os.path import join, isfile, isdir, dirname
import glob
from shutil import copyfile

try:
    from urlparse import urlparse  # Python 2
except ImportError:
    from urllib.parse import urlparse  # Python 3

CRAWL_DB_EXT = ".sqlite"
DB_SCHEMA_SUFFIX = "_db_schema.txt"
# print progress every million rows
PRINT_PROGRESS_EVERY = 10**6


def load_alexa_ranks(alexa_csv_path):
    """Load an Alexa top-sites CSV (rank,site) into a site -> rank dict."""
    site_ranks = dict()
    with open(alexa_csv_path) as csv_file:
        for line in csv_file:
            parts = line.strip().split(',')
            site_ranks[parts[1]] = int(parts[0])
    return site_ranks


def get_column_names(table_name, cursor):
    """Return the column names for a table.

    Modified from https://stackoverflow.com/a/38854129
    """
    cursor.execute("SELECT * FROM %s" % table_name)
    return " ".join([member[0] for member in cursor.description])


def get_table_and_column_names(db_path):
    """Return the table and column names for a database.

    Modified from: https://stackoverflow.com/a/33100538
    """
    db_schema_str = ""
    db = sqlite3.connect(db_path)
    cursor = db.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    for table_name in cursor.fetchall():
        table_name = table_name[0]
        db_schema_str += "%s %s\n" % (table_name,
                                      get_column_names(table_name, cursor))
    db.close()
    return db_schema_str


def start_worker_processes(worker_function, queue, num_workers=1):
    """Start `num_workers` processes that consume jobs from `queue`."""
    workers = []
    for _ in range(num_workers):
        worker_proc = Process(target=worker_function, args=(queue,))
        worker_proc.start()
        workers.append(worker_proc)
    return workers


def get_tld_or_host(url):
    """Return the registered domain (PS+1) of a URL, or its host if it is an
    IP address; return None if neither can be determined."""
    try:
        return get_tld(url, fail_silently=False)
    except Exception:
        hostname = urlparse(url).hostname
        try:
            ipaddress.ip_address(hostname)
            return hostname
        except Exception:
            return None


def is_third_party(req_url, top_level_url):
    """Return (is_third_party, request PS+1, site PS+1) for a request.

    `is_third_party` is None when either PS+1 cannot be determined.
    """
    # TODO: when we have missing information we return None, which is falsy,
    # so callers treating the result as a boolean will consider this a
    # first-party request.
    # Let's make sure this doesn't have any strange side effects.
    # We can also try returning `unknown`.
    if not top_level_url:
        return (None, "", "")
    site_ps1 = get_tld_or_host(top_level_url)
    if site_ps1 is None:
        return (None, "", "")
    req_ps1 = get_tld_or_host(req_url)
    if req_ps1 is None:
        # print(req_url)
        return (None, "", site_ps1)
    if req_ps1 == site_ps1:
        return (False, req_ps1, site_ps1)
    return (True, req_ps1, site_ps1)
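
# Illustrative return values, assuming get_tld resolves URLs to their
# public suffix + 1 (the example URLs below are hypothetical):
#   is_third_party("http://cdn.tracker.net/a.js", "http://www.example.com/")
#       -> (True, "tracker.net", "example.com")
#   is_third_party("http://static.example.com/a.js", "http://www.example.com/")
#       -> (False, "example.com", "example.com")
#   is_third_party("http://cdn.tracker.net/a.js", "")
#       -> (None, "", "")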


def copy_if_not_exists(src, dst):
    """Copy `src` to `dst` unless `dst` already exists."""
    if not isfile(dst):
        print("Copying %s to %s" % (src, dst))
        copyfile(src, dst)


def read_json(json_path):
    with open(json_path) as json_file:
        return json.load(json_file)


def dump_as_json(obj, json_path):
    with open(json_path, 'w') as f:
        json.dump(obj, f)


def get_crawl_db_path(crawl_dir):
    """Return the path of the single crawl database in `crawl_dir`."""
    sqlite_files = glob.glob(join(crawl_dir, "*" + CRAWL_DB_EXT))
    assert len(sqlite_files) == 1
    return sqlite_files[0]


def get_crawl_dir(crawl_dir):
    if isdir(crawl_dir):
        return crawl_dir
    else:
        print("Missing crawl dir (archive name mismatch)", crawl_dir)
        crawl_dir_pattern = join(dirname(crawl_dir), "*201*")
        crawl_dirs = glob.glob(crawl_dir_pattern)
        assert len(crawl_dirs) == 1
        return crawl_dirs[0]


def print_progress(t0, processed, num_rows):
    """Print throughput stats every PRINT_PROGRESS_EVERY processed rows."""
    if processed % PRINT_PROGRESS_EVERY == 0:
        elapsed = time() - t0
        speed = processed / elapsed
        progress = 100 * processed / num_rows
        remaining = (num_rows - processed) / speed
        print("Processed: %iK (%0.2f%%) Speed: %d rows/s | Elapsed %0.2f s"
              " | Remaining %d mins" % (
                  processed / 1000, progress, speed, elapsed, remaining / 60))
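

# A minimal usage sketch, assuming the crawl directory passed on the command
# line contains a single *.sqlite database. The schema file naming
# (CRAWL_DB_EXT swapped for DB_SCHEMA_SUFFIX) is an illustrative assumption
# about how these constants fit together.
if __name__ == "__main__":
    import sys
    crawl_dir = get_crawl_dir(sys.argv[1])
    db_path = get_crawl_db_path(crawl_dir)
    schema_path = db_path.replace(CRAWL_DB_EXT, DB_SCHEMA_SUFFIX)
    with open(schema_path, "w") as schema_file:
        schema_file.write(get_table_and_column_names(db_path))
    print("Wrote schema for %s to %s" % (db_path, schema_path))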