-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstorage_db.py
97 lines (87 loc) · 2.89 KB
/
storage_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import psycopg2
from psycopg2.extras import execute_values
import pandas as pd
from dotenv import load_dotenv
# get env
load_dotenv()
DBNAME = os.getenv('DBNAME')
DBUSER = os.getenv('DBUSER')
DBPASSWORD = os.getenv('DBPASSWORD')
DBHOST = os.getenv('DBHOST')
DBPORT = os.getenv('DBPORT')
class DBStorage():
def __init__(self):
# connection to PostgreSQL
self.con = psycopg2.connect(
dbname=DBNAME,
user=DBUSER,
password=DBPASSWORD,
host=DBHOST,
port=DBPORT
)
self.create_tables()
def create_tables(self):
cursor = self.con.cursor()
results_table = '''
CREATE TABLE IF NOT EXISTS results (
id SERIAL PRIMARY KEY,
query TEXT,
rank INTEGER,
link TEXT,
title TEXT,
snippet TEXT,
html TEXT,
created TIMESTAMP,
relevance INTEGER,
UNIQUE(query, link)
);
'''
cursor.execute(results_table)
self.con.commit()
cursor.close()
# method to get results on query from DB
def query_results(self, query):
cursor = self.con.cursor()
cursor.execute(f"SELECT * FROM results WHERE query = %s ORDER BY rank ASC;", (query,))
rows = cursor.fetchall()
cursor.close()
columns = [desc[0] for desc in cursor.description]
df = pd.DataFrame(rows, columns=columns)
return df
def insert_row(self, values):
cursor = self.con.cursor()
try:
insert_query = '''
INSERT INTO results (query, rank, link, title, snippet, html, created, relevance)
VALUES %s
'''
# cleaning from NUL, this method protect us from invalid response when we scrap html pages
cleaned_values = []
for row in values:
cleaned_row = []
for value in row:
if isinstance(value, str):
cleaned_row.append(value.replace('\x00', ''))
else:
cleaned_row.append(value)
cleaned_values.append(tuple(cleaned_row))
execute_values(cursor, insert_query, cleaned_values)
self.con.commit()
except psycopg2.IntegrityError:
pass
cursor.close()
def update_relevance(self, query, link, relevance):
cursor = self.con.cursor()
update_query = '''
UPDATE results
SET relevance = %s
WHERE query = %s AND link = %s
'''
cursor.execute(update_query, (relevance, query, link))
self.con.commit()
cursor.close()
def __del__(self):
# CLOSE CONNECTION TO DB
self.con.close()
# also we can add machine learning based on relevance (sql row):