
Script to run queries in Catalog and fulltext search in Kubernetes wi… #15

Merged · 1 commit · Jan 31, 2025
18 changes: 18 additions & 0 deletions README.md
@@ -383,6 +383,24 @@ application outside the docker and using the command below:

Note: To find all the distribution by rights categories, you can query the table `attributes` in MySql database.

**Phase 3**: Create scripts to monitor the Solr servers and the application.

- Create the script `~/ht_solr_monitoring/solr_query_monitoring.py` to generate queries to test the performance of the Solr cluster.

**Use case 1**: Downloading large result sets (over 20,000 items) using cursor-based paging or the Solr export handler.
**Use case 2**: Running different kinds of Solr queries to measure their performance.

- These use cases are relevant because we want to monitor the query performance of our different Solr clusters.
We have created some Grafana dashboards, and we need the flexibility to add different kinds of queries and see how each
cluster behaves.

The main logic for building the queries is in the script `ht_solr_monitoring/solr_query_monitoring.py`. Currently, there are two
alternatives: querying the Catalog Solr server or the full-text Solr server. You can run the script with the parameters
shown below; for Catalog, change `--cluster_name` to `catalog`. The script lets you pass the name of the collection,
the environment, and the Solr host. If you run the script in your local environment, the Solr host will be `http://localhost:8983`.

```docker compose exec full_text_searcher python ht_solr_monitoring/solr_query_monitoring.py --solr_host http://solr-lss-dev:8983 --collection_name core-x --env dev --cluster_name fulltext```
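The cursor-based paging from use case 1 can be sketched roughly as follows. This is a minimal sketch, not the repository's implementation: it assumes a standard `/select` handler and a schema whose uniqueKey is `id`; the function names are illustrative.

```python
import requests

def advance_cursor(params: dict, response: dict) -> bool:
    """Move params to the next cursorMark; return False once the last page is reached."""
    next_cursor = response["nextCursorMark"]
    if next_cursor == params["cursorMark"]:
        # Solr signals the end of the result set by repeating the same cursorMark
        return False
    params["cursorMark"] = next_cursor
    return True

def iterate_all_docs(solr_url: str, query: str, page_size: int = 500):
    """Yield every matching document page by page using Solr cursor-based deep paging."""
    # Cursor paging requires a sort that includes the uniqueKey field
    params = {"q": query, "rows": page_size, "sort": "id asc",
              "cursorMark": "*", "wt": "json"}
    while True:
        response = requests.get(f"{solr_url}/select", params=params, timeout=10).json()
        yield from response["response"]["docs"]
        if not advance_cursor(params, response):
            break
```

For very large exports, the Solr `/export` handler mentioned in use case 1 is an alternative that streams sorted results without paging at all.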

## Tests
- This application is tested using the pytest library.
- To run the tests, you can use the command `pytest` in the terminal.
27 changes: 23 additions & 4 deletions ht_full_text_search/export_all_results.py
@@ -29,6 +29,22 @@
# If you want to do a phrase query, be sure to surround it in double quotes, e.g.
# poetry run python3 ht_full_text_search/export_all_results.py '"a phrase"'

# TODO: ht_full_text_search should evolve into the Python library we use for querying all of our Solr clusters.
# Right now, the code is implemented to run queries only in the full-text search cluster. We should have a more
# generic way to query any Solr cluster we have, including the catalog ones, with configuration options for:
#   - the Solr cluster to query, the environment, the collection, etc.
#   - the fields to return in the query results
#   - the fields to use in the query
#   - the fields to use in the query to boost the results

# We should have generic classes to search, build queries, apply filters and facets, and print the query results.
# We should then create specific classes (catalog => catalog-api, catalog-monitoring; fulltext => fulltext-api,
# fulltext-monitoring), children of the generic ones, each with its own way of building queries.

# TODO: Implement a class to manage Solr query results:
#   - specify the fields to show in the query result
#   - specify whether the Solr debug output will be shown; create our own debug dictionary with the
#     fields we choose, e.g. QTime, status, shards, etc.
def process_results(item: dict) -> str:

""" Prepare the dictionary with Solr results to be exported as JSON """
@@ -78,7 +94,7 @@ def __init__(self, solr_url: str, env: str, user=None, password=None):
:param env: str, environment. It could be dev or prod
"""

self.solr_url = solr_url
self.solr_url = f"{solr_url}/query"
self.environment = env
self.headers = {"Content-Type": "application/json"}
self.auth = HTTPBasicAuth(user, password) if user and password else None
@@ -110,6 +126,8 @@ def run_cursor(self, query):

params = default_solr_params(self.environment)
params["cursorMark"] = "*"
# TODO: Make the Solr debug output accessible from this Python script
params["debugQuery"] = "true"
params["q"] = make_query(query)

while True:
@@ -151,14 +169,15 @@ def get_solr_status(self):

parser = ArgumentParser()
parser.add_argument("--env", default=os.environ.get("HT_ENVIRONMENT", "dev"))
parser.add_argument("--solr_url", help="Solr url", default=None)
parser.add_argument("--solr_host", help="Solr host", default=None)
parser.add_argument("--collection_name", help="Name of the collection", default=None)
parser.add_argument('--query', help='Query string', required=True)

args = parser.parse_args()

# Build the Solr URL from the host and collection received as parameters
if args.solr_url:
solr_url = args.solr_url
if args.solr_host:
solr_url = f"{args.solr_host}/solr/{args.collection_name}"
else: # Use the default solr url, depending on the environment. If prod environment, use shards
solr_url = SOLR_URL[args.env]
solr_exporter = SolrExporter(solr_url, args.env,
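The generic/specific class split described in the TODO comments above could look roughly like this. It is only a sketch of the proposal: every class and method name here is hypothetical, not part of the current code.

```python
from dataclasses import dataclass, field

@dataclass
class SolrQuerySpec:
    """Configurable pieces of a query: query string, returned fields, filters, boosts."""
    query: str
    return_fields: list = field(default_factory=lambda: ["id"])
    filters: list = field(default_factory=list)
    boosts: dict = field(default_factory=dict)

class SolrSearcher:
    """Generic searcher: builds query parameters; subclasses override build_params."""
    def __init__(self, solr_url: str):
        self.solr_url = solr_url

    def build_params(self, spec: SolrQuerySpec) -> dict:
        params = {"q": spec.query, "fl": ",".join(spec.return_fields), "wt": "json"}
        if spec.filters:
            params["fq"] = spec.filters
        return params

class CatalogSearcher(SolrSearcher):
    """Catalog-specific query building, e.g. applying boosts via edismax."""
    def build_params(self, spec: SolrQuerySpec) -> dict:
        params = super().build_params(spec)
        if spec.boosts:
            params["defType"] = "edismax"
            params["qf"] = " ".join(f"{f}^{w}" for f, w in spec.boosts.items())
        return params
```

The monitoring scripts (catalog-monitoring, fulltext-monitoring) would then reuse these searchers instead of building query dictionaries by hand.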
Empty file added ht_solr_monitoring/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions ht_solr_monitoring/solr_query_monitoring.py
@@ -0,0 +1,130 @@
import os
from argparse import ArgumentParser

import requests
import time
import json
from statistics import mean, median

from requests.auth import HTTPBasicAuth

from ht_full_text_search.export_all_results import SolrExporter


def send_solr_query(solr_base_url: str, query: dict = None,
                    user: str = None, password: str = None, response_times: list = None,
                    error_count: int = 0, total_queries: int = 0) -> tuple:
    """
    Send a query to Solr and measure the response time.

    :param solr_base_url: Base URL of the Solr collection, e.g. http://localhost:8983/solr/core-x
    :param query: Dictionary of Solr query parameters
    :param user: Solr user (optional)
    :param password: Solr password (optional)
    :param response_times: List collecting the response time of each successful query
    :param error_count: Number of failed queries so far
    :param total_queries: Number of queries sent so far
    :return: Tuple (error_count, total_queries) updated with this query
    """

    try:
        # Construct Solr query URL
        url = f"{solr_base_url}/select"

        # Record start time
        start_time = time.time()

        auth = HTTPBasicAuth(user, password) if user and password else None

        # Send query request
        response = requests.get(url, params=query, timeout=10, auth=auth)

        # Calculate the response time before printing the results, so printing
        # does not distort the measurement
        response_time = time.time() - start_time

        # Check for errors before trying to parse the response body
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            error_count += 1
        else:
            response_times.append(response_time)
            print(f"Query successful. Response time: {response_time:.3f} seconds")

            output = json.loads(response.content)
            for result in output['response']['docs']:
                print(result)
    except Exception as e:
        print(f"Error: {e}")
        error_count += 1
    finally:
        total_queries += 1

    # Integers are passed by value in Python, so the updated counters must be
    # returned to the caller for the metrics to be accurate
    return error_count, total_queries

def print_metrics(response_times: list, error_count: int, total_queries: int) -> None:
    print("\n=== Solr Query Performance Metrics ===")
    print(f"Total Queries: {total_queries}")
    print(f"Errors: {error_count}")
    if response_times:
        print(f"Average Response Time: {mean(response_times):.3f} seconds")
        print(f"Median Response Time: {median(response_times):.3f} seconds")
        print(f"Fastest Query Time: {min(response_times):.3f} seconds")
        print(f"Slowest Query Time: {max(response_times):.3f} seconds")
    else:
        print("No successful queries recorded.")
    print("======================================\n")

def main():

    parser = ArgumentParser()
    parser.add_argument("--env", default=os.environ.get("HT_ENVIRONMENT", "dev"))
    parser.add_argument("--solr_host", help="Solr host", default=None)
    parser.add_argument("--collection_name", help="Name of the collection", default=None)
    parser.add_argument("--cluster_name", help="It can be catalog or fulltext", required=True)

    args = parser.parse_args()

    solr_base_url = f"{args.solr_host}/solr/{args.collection_name}"  # e.g. http://localhost:8983/solr/core-x

    # Set the experiment parameters
    INTERVAL = 1  # seconds
    TEST_DURATION = 60  # seconds
    response_times = []
    error_count = 0
    total_queries = 0

    print("Starting Solr Query Performance Test...")
    start_time = time.time()

    while time.time() - start_time < TEST_DURATION:
        # TODO: Generate random Solr queries using different kinds of queries and parameters:
        # query by id; queries that involve different shards (by title, author, date, source);
        # faceted search

        # Catalog
        if args.cluster_name == "catalog":
            query = {
                "q": "title:Opera OR title:Shakespeare OR title:Science",
                "sort": "id desc",
                "rows": 1000
            }
            print(query)

            error_count, total_queries = send_solr_query(
                solr_base_url, query, os.getenv("SOLR_USER"), os.getenv("SOLR_PASSWORD"),
                response_times, error_count, total_queries)
        # Full-text
        if args.cluster_name == "fulltext":
            query = "health"  # e.g. '"good"' for a phrase query
            solr_exporter = SolrExporter(solr_base_url, args.env,
                                         user=os.getenv("SOLR_USER"), password=os.getenv("SOLR_PASSWORD"))
            for x in solr_exporter.run_cursor(query):
                print(x)

        time.sleep(INTERVAL)

    # Metrics are collected only for Catalog queries
    print_metrics(response_times, error_count, total_queries)



if __name__ == "__main__":
main()
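The TODO above about generating different kinds of queries could start from something like this. It is a sketch only: the field names and sample terms are illustrative, not taken from the real schema.

```python
import random

# Each "shape" builds one kind of Catalog query from a search term:
# by id, by title, by author, and a faceted count-only query.
QUERY_SHAPES = [
    lambda term: {"q": f"id:{term}", "rows": 10},
    lambda term: {"q": f"title:{term}", "sort": "id desc", "rows": 1000},
    lambda term: {"q": f"author:{term}", "rows": 100},
    lambda term: {"q": "*:*", "facet": "true", "facet.field": "format", "rows": 0},
]
SAMPLE_TERMS = ["Opera", "Shakespeare", "Science"]

def random_query(rng=random) -> dict:
    """Pick a random query shape and fill it with a random sample term."""
    shape = rng.choice(QUERY_SHAPES)
    return shape(rng.choice(SAMPLE_TERMS))
```

The main loop could then call `random_query()` once per iteration instead of sending the same hard-coded query, which would make the Grafana dashboards reflect a wider mix of query costs.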