Skip to content

Commit

Permalink
Merge pull request #16 from hathitrust/DEV-1479-create_catalog-config…
Browse files Browse the repository at this point in the history
…-query

applied some changes to reproduce full-text search queries on Catalog…
  • Loading branch information
liseli authored Feb 7, 2025
2 parents f00c97a + ee1ebb4 commit 9a5f570
Show file tree
Hide file tree
Showing 11 changed files with 145 additions and 58 deletions.
26 changes: 26 additions & 0 deletions config_files/catalog_search/config_query.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# This config file contains the parameters used for creating the Solr query in Catalog Search. It is for testing purposes only.

all:
pf: # Fields used for boosting results
- [title_ab, 10000]
- [title_a, 8000]
- [author, 1600]
- [author2, 800]
- [author_top, 100]
qf: # Fields used to find the results
- [title, 10]
- [author, 80]
- [author2, 50]
- [author_top, 30]
mm: "100%"
tie: 0.5
parser: "edismax"
debug: "all"

titleonly:
parser: "edismax"
debug: "all"
qf:
- [title, 500000]
mm: "100%"
tie: 0.9
12 changes: 9 additions & 3 deletions config_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@
sys.path.insert(0, current_dir)

# Full-text search config parameters
SOLR_URL = {
"prod": "http://macc-ht-solr-lss-1.umdl.umich.edu:8081/solr/core-1x/query",
"dev": "http://solr-lss-dev:8983/solr/core-x/query"
FULL_TEXT_SOLR_URL = {
"prod": "http://macc-ht-solr-lss-1.umdl.umich.edu:8081/solr/core-1x",
"dev": "http://solr-lss-dev:8983/solr/core-x"
}

CATALOG_SOLR_URL = {
"dev": "http://localhost:8983"
}

FULL_TEXT_SEARCH_SHARDS_X = ','.join([f"http://solr-sdr-search-{i}:8081/solr/core-{i}x" for i in range(1, 12)])
Expand All @@ -29,6 +33,8 @@


def default_solr_params(env: str = "prod"):
# TODO: Shards are only added for the prod environment and full-text search, so this function has to be changed to
# ensure we can also access Catalog in the prod environment.
"""
Return the default solr parameters
:param env:
Expand Down
67 changes: 48 additions & 19 deletions ht_full_text_search/export_all_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
import yaml
from requests.auth import HTTPBasicAuth

current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
sys.path.insert(0, current_dir)
# Add the parent directory ~/ht_full_text_search into the PYTHONPATH.
current = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent = os.path.dirname(current)
sys.path.insert(0, parent)

config_file_path = os.path.join(os.path.abspath(os.path.join(os.getcwd())),
'config_files', 'full_text_search', 'config_query.yaml')

from config_search import default_solr_params, SOLR_URL
from config_search import default_solr_params, FULL_TEXT_SOLR_URL

# This is a quick attempt to do a query to solr more or less as we issue it in
# production and to then export all results using the cursorMark results
Expand Down Expand Up @@ -56,33 +55,39 @@ def process_results(item: dict) -> str:
})


def solr_query_params(config_file=config_file_path, conf_query="ocr"):
def solr_query_params(query_config_file=None, conf_query="ocr"):

""" Prepare the Solr query parameters
:param config_file: str, path to the config file
:param conf_query: str, query configuration name
:param query_config_file: str, path to the config file with the queries
:param conf_query: str, query configuration name. Each query has a name to identify it.
:return: str, formatted Solr query parameters
"""

with open(config_file, "r") as file:
with open(query_config_file, "r") as file:
data = yaml.safe_load(file)[conf_query]

params = {
"pf": SolrExporter.create_boost_phrase_fields(data["pf"]),
"qf": SolrExporter.create_boost_phrase_fields(data["qf"]),
"mm": data["mm"],
"tie": data["tie"]
}

if "pf" in data:
params.update({"pf": SolrExporter.create_boost_phrase_fields(data["pf"])})
if "qf" in data:
params.update({"qf": SolrExporter.create_boost_phrase_fields(data["qf"])})

return " ".join([f"{k}='{v}'" for k, v in params.items()])


def make_query(query):
def make_query(query, query_config_file=None, conf_query="ocr"):

""" Prepare the Solr query string
:param conf_query:
:param query_config_file:
:param query: str, query string
:return: str, formatted Solr query string
"""
return f"{{!edismax {solr_query_params()}}} {query}"
return f"{{!edismax {solr_query_params(query_config_file=query_config_file, conf_query=conf_query)}}} {query}"


class SolrExporter:
Expand All @@ -94,6 +99,10 @@ def __init__(self, solr_url: str, env: str, user=None, password=None):
:param env: str, environment. It could be dev or prod
"""

# TODO: We should load the query configuration file into memory to avoid reading it each time we run a query.
# SolrExporter should be re-implemented following the design of ht_query/ht_query.py, ht_search/ht_search.py
# We should create an exporter as part of this structure and we should create separate classes to manage the
# Catalog and FullText Solr clusters.
self.solr_url = f"{solr_url}/query"
self.environment = env
self.headers = {"Content-Type": "application/json"}
Expand All @@ -116,19 +125,35 @@ def send_query(self, params):

return response

def run_cursor(self, query):
def run_cursor(self, query_string, query_config_path=None, conf_query="ocr"):

# TODO: This function will receive the query string and the query type (ocr or all). From memory, it will
# instantiate the query parameters (params["q"]) and run the query.
# See below how the params dictionary is created. As the fields about the query are already in memory, we should
# update the field q in the params dictionary with the query string and run the query.

""" Run the cursor to export all results
params = {'cursorMark': '*',
'debugQuery': 'true',
'fl': 'title,author,id,shard,score',
'q': "{!edismax mm='100%' tie='0.9' qf='title^500000'} health",
'rows': 500,
'sort': 'id asc',
'wt': 'json'}
The cursorMark parameter is used to keep track of the current position in the result set.
:param query: str, query string
:param conf_query:
:param query_config_path:
:param query_string: str, query string
:return: generator
"""

params = default_solr_params(self.environment)
params["cursorMark"] = "*"
# TODO: Implement a feature to access the Solr debug output from this Python script
params["debugQuery"] = "true"
params["q"] = make_query(query)
params["q"] = make_query(query_string, query_config_path, conf_query=conf_query)

while True:
results = self.send_query(params) # send_query
Expand Down Expand Up @@ -179,9 +204,13 @@ def get_solr_status(self):
if args.solr_host:
solr_url = f"{args.solr_host}/solr/{args.collection_name}"
else: # Use the default solr url, depending on the environment. If prod environment, use shards
solr_url = SOLR_URL[args.env]
solr_url = FULL_TEXT_SOLR_URL[args.env]
solr_exporter = SolrExporter(solr_url, args.env,
user=os.getenv("SOLR_USER"), password=os.getenv("SOLR_PASSWORD"))

query_config_file_path = os.path.join(os.path.abspath(os.path.join(parent)),
'config_files', 'full_text_search', 'config_query.yaml')

# '"good"'
for x in solr_exporter.run_cursor(args.query):
for x in solr_exporter.run_cursor(args.query, query_config_path=query_config_file_path, conf_query="ocr"):
print(x)
4 changes: 2 additions & 2 deletions ht_full_text_search/ht_full_text_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import inspect
from argparse import ArgumentParser

from config_search import SOLR_URL
from config_search import FULL_TEXT_SOLR_URL
from ht_full_text_search.ht_full_text_query import HTFullTextQuery
from ht_searcher.ht_searcher import HTSearcher
from typing import Text, List, Dict
Expand Down Expand Up @@ -110,7 +110,7 @@ def retrieve_documents_from_file(self, q_string: Text = None, fl: List = None,
if args.solr_url:
solr_url = args.solr_url
else: # Use the default solr url, depending on the environment. If prod environment, use shards
solr_url = SOLR_URL[args.env]
solr_url = FULL_TEXT_SOLR_URL[args.env]

solr_user = os.getenv("SOLR_USER")
solr_password = os.getenv("SOLR_PASSWORD")
Expand Down
2 changes: 1 addition & 1 deletion ht_full_text_search/ht_full_text_searcher_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class TestHTFullTextSearcher:
def test_search(self, ht_full_text_query):
searcher = HTFullTextSearcher(
solr_url=config_search.SOLR_URL["dev"],
solr_url=config_search.FULL_TEXT_SOLR_URL["dev"],
ht_search_query=ht_full_text_query,
user=os.getenv("SOLR_USER"),
password=os.getenv("SOLR_PASSWORD")
Expand Down
2 changes: 1 addition & 1 deletion ht_searcher/ht_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def send_query(self, params):
# In chunked transfer, the data stream is divided into a series of non-overlapping "chunks".

response = requests.post(
url=self.solr_url, params=params, headers=self.headers, stream=True, auth=self.auth
url=f"{self.solr_url}/query", params=params, headers=self.headers, stream=True, auth=self.auth
)


Expand Down
2 changes: 1 addition & 1 deletion ht_searcher/ht_searcher_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def ht_searcher_fixture():
"""

return HTSearcher(
solr_url=config_search.SOLR_URL["dev"],
solr_url=config_search.FULL_TEXT_SOLR_URL["dev"],
environment="dev",
user = os.getenv("SOLR_USER"),
password = os.getenv("SOLR_PASSWORD")
Expand Down
55 changes: 32 additions & 23 deletions ht_solr_monitoring/solr_query_monitoring.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import inspect
import os
import sys
from argparse import ArgumentParser

import requests
Expand All @@ -8,8 +10,12 @@

from requests.auth import HTTPBasicAuth

from config_search import FULL_TEXT_SOLR_URL, CATALOG_SOLR_URL
from ht_full_text_search.export_all_results import SolrExporter

current = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent = os.path.dirname(current)
sys.path.insert(0, parent)

def send_solr_query(solr_base_url: str, query: dict = None,
user: str = None, password: str = None, response_times: list = None, error_count: int = 0,
Expand Down Expand Up @@ -79,8 +85,6 @@ def main():

args = parser.parse_args()

solr_base_url = f"{args.solr_host}/solr/{args.collection_name}" # http://localhost:8983/solr/core-x/query

# Set the experiment parameters
INTERVAL = 1 # seconds
TEST_DURATION = 60 # seconds
Expand All @@ -91,33 +95,38 @@ def main():
print("Starting Solr Query Performance Test...")
start_time = time.time()

# Default parameters are for full-text search
solr_host = FULL_TEXT_SOLR_URL[args.env]
config_files = 'full_text_search'
conf_query = "ocr"

# Overwrite default parameter for Catalog search
if args.cluster_name == "catalog":
solr_host = CATALOG_SOLR_URL[args.env]
config_files = 'catalog_search'
conf_query = "titleonly"

if args.solr_host:
solr_base_url = f"{args.solr_host}/solr/{args.collection_name}"
else:
solr_base_url = f"{solr_host}/solr/{args.collection_name}"


while time.time() - start_time < TEST_DURATION:
# TODO: Generate a random Solr query using different kinds of queries and parameters
# Query by id,
# Query that involves different shards by title, query by author, query by date, query by source
# Faceted search

# Catalog
if args.cluster_name == "catalog":
query = {
"q": "title:Opera OR title:Shakespeare OR title:Science",
"sort": "id desc",
"rows": 1000
}
print(query)


send_solr_query(solr_base_url, query, os.getenv("SOLR_USER"), os.getenv("SOLR_PASSWORD")
, response_times,
error_count, total_queries)
# Full-text
if args.cluster_name == "fulltext":
query = "health"
solr_exporter = SolrExporter(solr_base_url, args.env,
user=os.getenv("SOLR_USER"), password=os.getenv("SOLR_PASSWORD"))
# '"good"'
for x in solr_exporter.run_cursor(query):
print(x)
query_config_file_path = os.path.join(os.path.abspath(os.path.join(parent)),
'config_files', config_files, 'config_query.yaml')

query = "health"
solr_exporter = SolrExporter(solr_base_url, args.env,
user=os.getenv("SOLR_USER"), password=os.getenv("SOLR_PASSWORD"))
# '"good"'
for x in solr_exporter.run_cursor(query, query_config_file_path, conf_query=conf_query):
print(x)

time.sleep(INTERVAL)

Expand Down
25 changes: 21 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import argparse
import sys
import os
import inspect

from contextlib import asynccontextmanager
from fastapi.responses import StreamingResponse

import uvicorn
from fastapi import FastAPI

from config_search import SOLR_URL
from config_search import FULL_TEXT_SOLR_URL
from ht_full_text_search.export_all_results import SolrExporter

exporter_api = {}

# Add the directory that contains this file to the Python import path (sys.path).
current = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
#parent = os.path.dirname(current)
sys.path.insert(0, current)

def main():
parser = argparse.ArgumentParser()
Expand All @@ -31,8 +37,9 @@ async def lifespan(app: FastAPI):
exporter_api['obj'] = SolrExporter(args.solr_url, args.env, user=os.getenv("SOLR_USER"),
password=os.getenv("SOLR_PASSWORD"))
else:
exporter_api['obj'] = SolrExporter(SOLR_URL[args.env], args.env, user=os.getenv("SOLR_USER"),
password=os.getenv("SOLR_PASSWORD"))
exporter_api['obj'] = SolrExporter(FULL_TEXT_SOLR_URL[args.env], args.env, user=os.getenv("SOLR_USER"),
password=os.getenv("SOLR_PASSWORD"))

yield
# Add some logic here to close the connection
app = FastAPI(title="HT_FullTextSearchAPI", description="Search phrases in Solr full text index", lifespan=lifespan)
Expand All @@ -50,7 +57,17 @@ def solr_query_phrases(query):
:param query: phrase to search
:return: JSON with the results
"""
return StreamingResponse(exporter_api['obj'].run_cursor(query), media_type="application/json")

# TODO: run_cursor should receive the query_string and the query_type (ocr or all).
# When the API is started, the config file is loaded in memory,
# so the query type can be used to select the kind of query to run, and the params dict is updated with the query
# string.

query_config_file_path = os.path.join(os.path.abspath(os.path.join(current)),
'config_files', 'full_text_search', 'config_query.yaml')

return StreamingResponse(exporter_api['obj'].run_cursor(query, query_config_path=query_config_file_path,
conf_query="ocr"), media_type="application/json")

uvicorn.run(app, host="0.0.0.0", port=8000)

Expand Down
4 changes: 2 additions & 2 deletions scripts/generate_query_results_in_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pandas as pd

from config_search import SOLR_URL
from config_search import FULL_TEXT_SOLR_URL
from ht_full_text_search.ht_full_text_query import HTFullTextQuery
from ht_full_text_search.ht_full_text_searcher import HTFullTextSearcher

Expand Down Expand Up @@ -86,7 +86,7 @@ def get_list_phrases(file_path: str) -> list:
if args.solr_url:
solr_url = args.solr_url
else: # Use the default solr url, depending on the environment. If prod environment, use shards
solr_url = SOLR_URL[args.env]
solr_url = FULL_TEXT_SOLR_URL[args.env]

solr_user = os.getenv("SOLR_USER")
solr_password = os.getenv("SOLR_PASSWORD")
Expand Down
Loading

0 comments on commit 9a5f570

Please sign in to comment.