search.py
import os
import requests
from requests.exceptions import RequestException
import pandas as pd
from storage_db import DBStorage
from urllib.parse import quote_plus
from datetime import datetime
from dotenv import load_dotenv
# Load configuration from the environment (.env file)
load_dotenv()
RESULT_COUNT = int(os.getenv('RESULT_COUNT', '10'))  # total results to fetch; defaults to 10 if unset
SEARCH_URL = os.getenv('SEARCH_URL')
SEARCH_KEY = os.getenv('SEARCH_KEY')
SEARCH_ID = os.getenv('SEARCH_ID')
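# A minimal sketch of the expected .env file. The endpoint is the public
# Google Custom Search JSON API URL; the key and cx values are placeholders,
# and the {key}/{cx}/{query}/{start} fields match the .format() call below:
#
#   RESULT_COUNT=10
#   SEARCH_URL=https://www.googleapis.com/customsearch/v1?key={key}&cx={cx}&q={query}&start={start}
#   SEARCH_KEY=your-api-key
#   SEARCH_ID=your-programmable-search-engine-id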
# Query the Google Custom Search API and return the results as a DataFrame.
def search_api(query, pages=int(RESULT_COUNT / 10)):
    results = []
    for i in range(0, pages):
        # The API paginates in blocks of 10; its "start" parameter is 1-based.
        start = i * 10 + 1
        url = SEARCH_URL.format(
            key=SEARCH_KEY,
            cx=SEARCH_ID,
            # quote_plus URL-encodes the query, e.g. "hello world" -> "hello+world"
            query=quote_plus(query),
            start=start
        )
        response = requests.get(url)
        data = response.json()
        # Pages beyond the last result carry no "items" key, so default to [].
        results += data.get("items", [])
    res_df = pd.DataFrame.from_dict(results)
    res_df["rank"] = list(range(1, res_df.shape[0] + 1))
    res_df = res_df[["link", "rank", "snippet", "title"]]
    return res_df
# Download the raw HTML of each result page; a failed request yields "".
def scrape_page(links):
    html = []
    for link in links:
        try:
            data_page = requests.get(link, timeout=5)
            html.append(data_page.text)
        except RequestException:
            html.append("")
    return html
"""
MAIN SEARCH METHOD
How it works:
1. Checking the Database to see if this query already been searched
2. If it is return results
3. If it isn't:
- querying the Google API
- get new results
- format them properly
- save them to the DB
- return them
"""
def search(query):
    columns = ["query", "rank", "link", "title", "snippet", "html", "created", "relevance"]
    # DBStorage (from storage_db) wraps the results database.
    storage = DBStorage()
    stored_results = storage.query_results(query)
    if stored_results.shape[0] > 0:
        stored_results["created"] = pd.to_datetime(stored_results["created"])
        return stored_results[columns]
    # The DB has no results for this query, so hit the Google API.
    results = search_api(query)
    results["html"] = scrape_page(results["link"])
    # Drop results whose pages could not be downloaded.
    results = results[results["html"].str.len() > 0].copy()
    results["query"] = query
    results["created"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    results["relevance"] = None
    results = results[columns]
    # Collect each row as a tuple of column values for the DB insert.
    rows_to_insert = [tuple(row) for row in results.itertuples(index=False)]
    storage.insert_row(rows_to_insert)
    return results
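
# Example usage: a minimal sketch, assuming the .env variables above are set
# and the storage_db database has been initialized. The query string is
# just an illustration.
if __name__ == "__main__":
    df = search("python pandas tutorial")
    print(df[["rank", "title", "link"]].head())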