From f6dbe898de23f2ae820f398203112671f309d3f9 Mon Sep 17 00:00:00 2001 From: Lianet Sepulveda Torres Date: Thu, 23 Jan 2025 16:49:38 -0500 Subject: [PATCH] Script to run queries in Catalog and fulltext search in Kubernetes with the purpose of monitoring the solr server --- README.md | 18 +++ ht_full_text_search/export_all_results.py | 27 +++- ht_solr_monitoring/__init__.py | 0 ht_solr_monitoring/solr_query_monitoring.py | 130 ++++++++++++++++++++ 4 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 ht_solr_monitoring/__init__.py create mode 100644 ht_solr_monitoring/solr_query_monitoring.py diff --git a/README.md b/README.md index ee8362c..6e819cb 100644 --- a/README.md +++ b/README.md @@ -383,6 +383,24 @@ application outside the docker and using the command below: Note: To find all the distribution by rights categories, you can query the table `attributes` in MySql database. +**Phase 3**: Create scripts to monitor the Solr servers and the application. + +- Create the script `~/ht_solr_monitoring/solr_query_monitoring.py` to generate queries to test the performance of the Solr cluster. + +**Use case 1**: Downloading large result sets (over 20,000 items) using cursor-based paging and using the Solr export handler. +**Use case 2**: Create different kinds of Solr queries to measure their performance. + +- These use cases are relevant because we want to monitor the query performance of our different Solr clusters. +We have created some Grafana dashboards, and we need the flexibility to add different kinds of queries and see how the cluster +works. + +The main logic to create the queries is in the script `ht_solr_monitoring/solr_query_monitoring.py`. Currently, there are two different +alternatives to query Catalog and Full Text Solr servers. We can run the script with the following parameters. +For Catalog, you should change --cluster_name to catalog. The script allows you to pass the name of the collection, +the environment and the solr host. 
If you run the script on your local environment the solr host will be http://localhost:8983. + +```docker compose exec full_text_searcher python ht_solr_monitoring/solr_query_monitoring.py --solr_host http://solr-lss-dev:8983 --collection_name core-x --env dev --cluster_name fulltext``` + ## Tests - This application is tested using the pytest library. - To run the tests, you can use the command `pytest` in the terminal. diff --git a/ht_full_text_search/export_all_results.py b/ht_full_text_search/export_all_results.py index e45df35..47e743c 100644 --- a/ht_full_text_search/export_all_results.py +++ b/ht_full_text_search/export_all_results.py @@ -29,6 +29,22 @@ # If you want to do a phrase query, be sure to surround it in double quotes, e.g. # poetry run python3 ht_full_text_search/export_all_results.py '"a phrase"' +# TODO: ht_full_text_search should change to become the python library we use for querying our Solr clusters. Right now, +# the code is implemented to run queries only in the full text search cluster. We should have a more generic way to +# query any Solr cluster we have, including the catalog ones. +# We should have a way to configure the Solr cluster we want to query, the environment, the collection, etc. +# We should have a way to configure the fields we want to return in the query results +# We should have a way to configure the fields we want to use in the query +# We should have a way to configure the fields we want to use in the query to boost the results + +# We should have generic classes to Search, make queries, filters and facets and print the query results. +# We should create specific classes (catalog => catalog-api, catalog-monitoring, fulltext => fulltext-api, +# fulltext-monitoring) children of the generic ones that have their own ways to make queries + +# TODO: Implement the class to manage Solr query results. 
+# Specify the fields to show in the query result +# Specify if the Solr debug output will be shown. Create our own debug dictionary with fields we decide, +# e.g. QTime, status, shards, etc. def process_results(item: dict) -> str: """ Prepare the dictionary with Solr results to be exported as JSON """ @@ -78,7 +94,7 @@ def __init__(self, solr_url: str, env: str, user=None, password=None): :param env: str, environment. It could be dev or prod """ - self.solr_url = solr_url + self.solr_url = f"{solr_url}/query" self.environment = env self.headers = {"Content-Type": "application/json"} self.auth = HTTPBasicAuth(user, password) if user and password else None @@ -110,6 +126,8 @@ def run_cursor(self, query): params = default_solr_params(self.environment) params["cursorMark"] = "*" + # TODO: Implement the feature to access Solr debug using this python script + params["debugQuery"] = "true" params["q"] = make_query(query) while True: @@ -151,14 +169,15 @@ def get_solr_status(self): parser = ArgumentParser() parser.add_argument("--env", default=os.environ.get("HT_ENVIRONMENT", "dev")) - parser.add_argument("--solr_url", help="Solr url", default=None) + parser.add_argument("--solr_host", help="Solr host", default=None) + parser.add_argument("--collection_name", help="Name of the collection", default=None) parser.add_argument('--query', help='Query string', required=True) args = parser.parse_args() # Receive as a parameter a specific solr url - if args.solr_url: - solr_url = args.solr_url + if args.solr_host: + solr_url = f"{args.solr_host}/solr/{args.collection_name}" else: # Use the default solr url, depending on the environment. 
If prod environment, use shards solr_url = SOLR_URL[args.env] solr_exporter = SolrExporter(solr_url, args.env, diff --git a/ht_solr_monitoring/__init__.py b/ht_solr_monitoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ht_solr_monitoring/solr_query_monitoring.py b/ht_solr_monitoring/solr_query_monitoring.py new file mode 100644 index 0000000..b88ec0b --- /dev/null +++ b/ht_solr_monitoring/solr_query_monitoring.py @@ -0,0 +1,130 @@ +import os +from argparse import ArgumentParser + +import requests +import time +import json +from statistics import mean, median + +from requests.auth import HTTPBasicAuth + +from ht_full_text_search.export_all_results import SolrExporter + + +def send_solr_query(solr_base_url: str, query: dict = None, + user: str = None, password: str = None, response_times: list = None, error_count: int = 0, + total_queries: int = 0) -> None: + """ + Send a query to Solr and measure the response time. + :param solr_base_url: + :param collection_name: + :return: + """ + + try: + # Construct Solr query URL + url = f"{solr_base_url}/select" + + # Record start time + start_time = time.time() + + auth = HTTPBasicAuth(user, password) if user and password else None + + # Send query request + response = requests.get(url, params=query, timeout=10, auth=auth) + output = json.loads(response.content) + + for result in output['response']['docs']: + print(result) + + # Record end time + end_time = time.time() + + # Calculate response time + response_time = end_time - start_time + response_times.append(response_time) + + # Check for errors + if response.status_code != 200: + print(f"Error: Received status code {response.status_code}") + error_count += 1 + else: + print(f"Query successful. 
Response time: {response_time:.3f} seconds") + except Exception as e: + print(f"Error: {e}") + error_count += 1 + finally: + total_queries += 1 + +def print_metrics(response_times: list, error_count: int, total_queries: int) -> None: + print("\n=== Solr Query Performance Metrics ===") + print(f"Total Queries: {total_queries}") + print(f"Errors: {error_count}") + if response_times: + print(f"Average Response Time: {mean(response_times):.3f} seconds") + print(f"Median Response Time: {median(response_times):.3f} seconds") + print(f"Fastest Query Time: {min(response_times):.3f} seconds") + print(f"Slowest Query Time: {max(response_times):.3f} seconds") + else: + print("No successful queries recorded.") + print("======================================\n") + +def main(): + + parser = ArgumentParser() + parser.add_argument("--env", default=os.environ.get("HT_ENVIRONMENT", "dev")) + parser.add_argument("--solr_host", help="Solr url", default=None) + parser.add_argument("--collection_name", help="Name of the collection", default=None) + parser.add_argument("--cluster_name", help="It can be catalog or fulltext", required=True) + + args = parser.parse_args() + + solr_base_url = f"{args.solr_host}/solr/{args.collection_name}" # http://localhost:8983/solr/core-x/query + + # Set the experiment parameters + INTERVAL = 1 # seconds + TEST_DURATION = 60 # seconds + response_times = [] + error_count = 0 + total_queries = 0 + + print("Starting Solr Query Performance Test...") + start_time = time.time() + + while time.time() - start_time < TEST_DURATION: + # TODO: Generate a random Solr query using different kind of queries and parameters + # Query by id, + # Query that involves different shards by title, query by author, query by date, query by source + # Faceted search + + # Catalog + if args.cluster_name == "catalog": + query = { + "q": "title:Opera OR title:Shakespeare OR title:Science", + "sort": "id desc", + "rows": 1000 + } + print(query) + + + send_solr_query(solr_base_url, 
query, os.getenv("SOLR_USER"), os.getenv("SOLR_PASSWORD") + , response_times, + error_count, total_queries) + # Full-text + if args.cluster_name == "fulltext": + query = "health" + solr_exporter = SolrExporter(solr_base_url, args.env, + user=os.getenv("SOLR_USER"), password=os.getenv("SOLR_PASSWORD")) + # '"good"' + for x in solr_exporter.run_cursor(query): + print(x) + + time.sleep(INTERVAL) + + # Only for Catalog queries + print_metrics(response_times, error_count, total_queries) + + + +if __name__ == "__main__": + main() \ No newline at end of file