diff --git a/LICENSE b/LICENSE index 1baa903..0c5bd3d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 OpenHands +Copyright (c) 2024 Minesh A. Jethva Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index c904c53..4f36c36 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ If you use this project in your research, please cite: ```bibtex @software{timeseries_rag2024, - author = {OpenHands}, + author = {Jethva, Minesh A.}, title = {Time Series RAG}, year = {2024}, publisher = {GitHub}, diff --git a/docs/docs/source/conf.py b/docs/docs/source/conf.py index 56e8fde..4f8f4e0 100644 --- a/docs/docs/source/conf.py +++ b/docs/docs/source/conf.py @@ -3,8 +3,8 @@ sys.path.insert(0, os.path.abspath('../../..')) project = 'Time Series RAG' -copyright = '2024, OpenHands' -author = 'OpenHands' +copyright = '2024, Minesh A. Jethva' +author = 'Minesh A. Jethva' extensions = [ 'sphinx.ext.autodoc', diff --git a/docs/docs/source/examples/usage.rst b/docs/docs/source/examples/usage.rst new file mode 100644 index 0000000..714de1c --- /dev/null +++ b/docs/docs/source/examples/usage.rst @@ -0,0 +1,84 @@ +Usage Examples +============== + +Basic Usage +----------- + +Here are some examples of how to use the Time Series RAG system: + +Using the Python API +~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + from timeseries_rag.models import TimeSeriesEmbedder + from timeseries_rag.rag import TimeSeriesRAG, TimeSeriesDocument + import numpy as np + + # Initialize components + embedder = TimeSeriesEmbedder(target_length=256) + rag = TimeSeriesRAG() + + # Create some example time series + t = np.linspace(0, 10, 100) + sine_wave = np.sin(t) + noisy_sine = sine_wave + np.random.normal(0, 0.2, size=len(sine_wave)) + + # Embed and store the first time series + embedding = embedder.embed(sine_wave) + doc = TimeSeriesDocument( + id="sine_1", + data=sine_wave, + metadata={"type": "sine", "frequency": 1.0}, + embedding=embedding + ) + rag.add_document(doc) + + # Search for similar patterns + query_embedding = embedder.embed(noisy_sine) + results = rag.search(query_embedding, k=5) + + # Print results + for result in results: + print(f"Document ID: {result['id']}") + print(f"Distance: {result['distance']:.4f}") + print(f"Metadata: {result['metadata']}") + print() + +Using the Web API +~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import requests + import pandas as pd + import numpy as np + + # Create example data + t = np.linspace(0, 10, 100) + sine_wave = np.sin(t) + + # Save to CSV + pd.DataFrame(sine_wave).to_csv('sine.csv', index=False) + + # Upload time series + files = {'file': open('sine.csv', 'rb')} + metadata = '{"type": "sine", "frequency": 1.0}' + response = requests.post( + 'http://localhost:50758/upload', + files=files, + data={'metadata': metadata} + ) + print(response.json()) + + # Search for similar patterns + noisy_sine = sine_wave + np.random.normal(0, 0.2, size=len(sine_wave)) + pd.DataFrame(noisy_sine).to_csv('query.csv', index=False) + + files = {'file': open('query.csv', 'rb')} + response = requests.post( + 'http://localhost:50758/search', + files=files, + params={'k': 5} + ) + print(response.json()) \ No newline at end of file diff --git a/docs/docs/source/modules/api.rst b/docs/docs/source/modules/api.rst new file mode 100644 index 0000000..42d1e57 --- /dev/null +++ b/docs/docs/source/modules/api.rst @@ -0,0 +1,11 @@ +Web API +======= + +FastAPI Web Application +----------------------- + +.. automodule:: timeseries_rag.api + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ \ No newline at end of file diff --git a/docs/docs/source/modules/models.rst b/docs/docs/source/modules/models.rst new file mode 100644 index 0000000..6291d4b --- /dev/null +++ b/docs/docs/source/modules/models.rst @@ -0,0 +1,11 @@ +Models +====== + +Time Series Embedding +--------------------- + +.. automodule:: timeseries_rag.models + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ \ No newline at end of file diff --git a/docs/docs/source/modules/rag.rst b/docs/docs/source/modules/rag.rst new file mode 100644 index 0000000..e5cac96 --- /dev/null +++ b/docs/docs/source/modules/rag.rst @@ -0,0 +1,11 @@ +RAG System +========== + +Time Series RAG Implementation +------------------------------ + +.. 
automodule:: timeseries_rag.rag + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ \ No newline at end of file diff --git a/docs/docs/source/readme.rst b/docs/docs/source/readme.rst new file mode 100644 index 0000000..2d639e2 --- /dev/null +++ b/docs/docs/source/readme.rst @@ -0,0 +1,5 @@ +Overview +======== + +.. include:: ../../../README.md + :parser: myst_parser.sphinx_ \ No newline at end of file diff --git a/setup.py b/setup.py index a88326d..4cae7a4 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="timeseries_rag", version="0.1.0", - author="OpenHands", + author="Minesh A. Jethva", author_email="minesh.1291@gmail.com", description="Time series similarity search and retrieval augmented generation", long_description=long_description, diff --git a/src/timeseries_rag/api.py b/src/timeseries_rag/api.py index 6deec4a..c18af2a 100644 --- a/src/timeseries_rag/api.py +++ b/src/timeseries_rag/api.py @@ -1,3 +1,28 @@ +"""FastAPI Web Application for Time Series RAG. + +This module provides a web interface and REST API for the Time Series RAG system. +It allows users to upload time series data, add metadata, and search for similar +patterns through a user-friendly interface. 
+ +The module includes: +- REST API endpoints for uploading and searching time series +- Interactive web interface with Plotly visualizations +- CORS middleware for cross-origin requests +- File upload handling for CSV data +- Error handling and validation + +Example: + To run the web application: + + ```python + import uvicorn + from timeseries_rag.api import app + + if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=50758) + ``` +""" + from fastapi import FastAPI, UploadFile, File, HTTPException from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import HTMLResponse @@ -6,13 +31,18 @@ import numpy as np import pandas as pd import json -from typing import List, Dict +from typing import List, Dict, Optional import uuid -from models import TimeSeriesEmbedder -from rag import TimeSeriesRAG, TimeSeriesDocument +from .models import TimeSeriesEmbedder +from .rag import TimeSeriesRAG, TimeSeriesDocument -app = FastAPI() +# Create FastAPI application +app = FastAPI( + title="Time Series RAG", + description="Time series similarity search and retrieval augmented generation", + version="0.1.0" +) # Add CORS middleware app.add_middleware( @@ -30,8 +60,42 @@ @app.post("/upload") async def upload_timeseries( file: UploadFile = File(...), - metadata: str = None -): + metadata: Optional[str] = None +) -> Dict[str, str]: + """Upload a time series file with optional metadata. + + This endpoint accepts a CSV file containing time series data and optional + metadata in JSON format. The time series is embedded and stored in the RAG + system for later retrieval. + + Args: + file (UploadFile): CSV file containing time series data. Should have + one or more columns of numerical values. + metadata (Optional[str], optional): JSON string containing metadata + about the time series. Defaults to None. 
+ + Returns: + Dict[str, str]: A dictionary containing: + - status: "success" if upload was successful + - document_id: UUID of the stored document + + Raises: + HTTPException: If file reading, parsing, or storage fails. + + Example: + ```python + import requests + + files = {'file': open('timeseries.csv', 'rb')} + metadata = '{"type": "temperature", "location": "sensor1"}' + response = requests.post( + 'http://localhost:50758/upload', + files=files, + data={'metadata': metadata} + ) + print(response.json()) + ``` + """ try: content = await file.read() df = pd.read_csv(content) @@ -62,7 +126,40 @@ async def upload_timeseries( async def search_similar( file: UploadFile = File(...), k: int = 5 -): +) -> Dict[str, List[Dict]]: + """Search for similar time series patterns. + + This endpoint accepts a CSV file containing a query time series and returns + the k most similar time series from the database. + + Args: + file (UploadFile): CSV file containing the query time series data. + k (int, optional): Number of similar patterns to retrieve. Defaults to 5. + + Returns: + Dict[str, List[Dict]]: A dictionary containing: + - results: List of similar time series, each with: + - id: Document ID + - distance: L2 distance to query + - data: Time series values + - metadata: Document metadata + + Raises: + HTTPException: If file reading, parsing, or search fails. + + Example: + ```python + import requests + + files = {'file': open('query.csv', 'rb')} + response = requests.post( + 'http://localhost:50758/search', + files=files, + params={'k': 10} + ) + print(response.json()) + ``` + """ try: content = await file.read() df = pd.read_csv(content) @@ -79,7 +176,12 @@ async def search_similar( raise HTTPException(status_code=400, detail=str(e)) @app.get("/", response_class=HTMLResponse) -async def root(): +async def root() -> str: + """Serve the main web interface. + + Returns: + str: HTML content for the web interface. 
+ """ return """ @@ -166,10 +268,18 @@ async def root(): """ -if __name__ == "__main__": +def main(): + """Run the FastAPI application using uvicorn. + + This function is the entry point for running the web application. It configures + uvicorn with the appropriate host and port settings. + """ uvicorn.run( - "app:app", + "timeseries_rag.api:app", host="0.0.0.0", port=50758, reload=True - ) \ No newline at end of file + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/timeseries_rag/models.py b/src/timeseries_rag/models.py index 232d735..d76e069 100644 --- a/src/timeseries_rag/models.py +++ b/src/timeseries_rag/models.py @@ -1,20 +1,84 @@ +"""Time Series Embedding Module. + +This module provides functionality for converting time series data into fixed-length +embeddings using a combination of resampling and statistical features. The embeddings +can be used for similarity search and retrieval tasks. + +Example: + >>> embedder = TimeSeriesEmbedder(target_length=256) + >>> time_series = [1.0, 2.0, 3.0, 2.0, 1.0] + >>> embedding = embedder.embed(time_series) + >>> print(embedding.shape) + (1, 260) # 256 resampled points + 4 statistical features +""" + import numpy as np from sklearn.preprocessing import StandardScaler from scipy.signal import resample +from typing import Union, List, Tuple class TimeSeriesEmbedder: - def __init__(self, target_length=256): + """A class for converting time series data into fixed-length embeddings. + + This class provides functionality to transform variable-length time series into + fixed-length embeddings by combining resampled values with statistical features. + The embeddings can be used for similarity search and other downstream tasks. + + Attributes: + target_length (int): The desired length of the resampled time series. + Default is 256 points. + scaler (StandardScaler): A scikit-learn StandardScaler instance for + normalizing the time series data. 
+ + Example: + >>> embedder = TimeSeriesEmbedder(target_length=128) + >>> time_series = np.sin(np.linspace(0, 10, 1000)) + >>> embedding = embedder.embed(time_series) + >>> print(f"Embedding shape: {embedding.shape}") + """ + + def __init__(self, target_length: int = 256): + """Initialize the TimeSeriesEmbedder. + + Args: + target_length (int, optional): The desired length of the resampled + time series. Defaults to 256. + """ self.target_length = target_length self.scaler = StandardScaler() - def embed(self, time_series): - """Convert time series to embedding vector using resampling and statistical features""" + def embed(self, time_series: Union[List[float], np.ndarray]) -> np.ndarray: + """Convert time series to embedding vector using resampling and statistical features. + + This method performs the following steps: + 1. Converts input to numpy array if necessary + 2. Reshapes to 2D array if necessary + 3. Normalizes the time series using StandardScaler + 4. Resamples to fixed length using scipy.signal.resample + 5. Extracts statistical features (mean, std, max, min) + 6. Combines resampled values with statistical features + + Args: + time_series (Union[List[float], np.ndarray]): Input time series data. + Can be a 1D list/array or 2D array with shape (n_samples, n_features). + + Returns: + np.ndarray: A 2D array of shape (1, target_length + 4) containing the + embedding vector. The first target_length elements are the resampled + values, followed by mean, std, max, and min statistics. + + Raises: + ValueError: If the input time series is empty or has invalid dimensions. 
+ """ if isinstance(time_series, list): time_series = np.array(time_series) if len(time_series.shape) == 1: time_series = time_series.reshape(-1, 1) + if time_series.size == 0: + raise ValueError("Input time series is empty") + # Normalize time_series = self.scaler.fit_transform(time_series) diff --git a/src/timeseries_rag/rag.py b/src/timeseries_rag/rag.py index 7d1480f..2ce9828 100644 --- a/src/timeseries_rag/rag.py +++ b/src/timeseries_rag/rag.py @@ -1,30 +1,151 @@ +"""Time Series Retrieval Augmented Generation (RAG) Module. + +This module provides functionality for storing and retrieving time series data using +vector similarity search. It implements a RAG system specifically designed for time +series data, allowing efficient storage and retrieval of similar patterns. + +Example: + >>> from timeseries_rag.models import TimeSeriesEmbedder + >>> embedder = TimeSeriesEmbedder() + >>> rag = TimeSeriesRAG() + >>> + >>> # Add a document + >>> ts_data = np.sin(np.linspace(0, 10, 100)) + >>> embedding = embedder.embed(ts_data) + >>> doc = TimeSeriesDocument( + ... id="sin_wave_1", + ... data=ts_data, + ... metadata={"type": "sine", "frequency": 1.0}, + ... embedding=embedding + ... ) + >>> rag.add_document(doc) + >>> + >>> # Search for similar patterns + >>> query = np.sin(np.linspace(0, 10, 100) + 0.1) + >>> query_embedding = embedder.embed(query) + >>> results = rag.search(query_embedding, k=5) +""" + import faiss import numpy as np from dataclasses import dataclass -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional, Union @dataclass class TimeSeriesDocument: + """A dataclass representing a time series document with metadata and embedding. + + This class stores all information related to a time series, including its raw + data, metadata, and vector embedding for similarity search. + + Attributes: + id (str): Unique identifier for the time series. + data (np.ndarray): Raw time series data. 
+ metadata (Dict[str, Any]): Additional information about the time series. + embedding (Optional[np.ndarray]): Vector embedding of the time series, + used for similarity search. Default is None. + + Example: + >>> data = np.array([1.0, 2.0, 3.0, 2.0, 1.0]) + >>> doc = TimeSeriesDocument( + ... id="example_1", + ... data=data, + ... metadata={"type": "example"}, + ... embedding=np.array([0.1, 0.2, 0.3]) + ... ) + """ + id: str data: np.ndarray metadata: Dict[str, Any] - embedding: np.ndarray = None + embedding: Optional[np.ndarray] = None class TimeSeriesRAG: - def __init__(self, embedding_dim=256): + """A class implementing Retrieval Augmented Generation for time series data. + + This class provides functionality for storing time series documents and + retrieving similar patterns using FAISS vector similarity search. + + Attributes: + embedding_dim (int): Dimension of the time series embeddings. + index (faiss.Index): FAISS index for similarity search. + documents (List[TimeSeriesDocument]): List of stored time series documents. + + Example: + >>> rag = TimeSeriesRAG(embedding_dim=260) + >>> doc = TimeSeriesDocument(...) + >>> rag.add_document(doc) + >>> results = rag.search(query_embedding, k=5) + """ + + def __init__(self, embedding_dim: int = 260): + """Initialize the TimeSeriesRAG system. + + Args: + embedding_dim (int, optional): Dimension of the time series embeddings. + Should match the output dimension of your embedding model. + Defaults to 260 (256 resampled points + 4 statistical features). + """ self.embedding_dim = embedding_dim self.index = faiss.IndexFlatL2(embedding_dim) self.documents: List[TimeSeriesDocument] = [] - def add_document(self, doc: TimeSeriesDocument): - if doc.embedding is not None: - self.index.add(doc.embedding.reshape(1, -1)) - self.documents.append(doc) + def add_document(self, doc: TimeSeriesDocument) -> None: + """Add a time series document to the RAG system. + + Args: + doc (TimeSeriesDocument): The document to add. 
Must have a valid + embedding for similarity search. + + Raises: + ValueError: If the document's embedding is None or has incorrect shape. + """ + if doc.embedding is None: + raise ValueError("Document must have an embedding") + + if doc.embedding.shape[-1] != self.embedding_dim: + raise ValueError( + f"Embedding dimension mismatch. Expected {self.embedding_dim}, " + f"got {doc.embedding.shape[-1]}" + ) + + self.index.add(doc.embedding.reshape(1, -1)) + self.documents.append(doc) - def search(self, query_embedding: np.ndarray, k: int = 5): + def search( + self, + query_embedding: np.ndarray, + k: int = 5 + ) -> List[Dict[str, Any]]: + """Search for similar time series patterns. + + Args: + query_embedding (np.ndarray): The embedding vector of the query time + series. Must match the embedding dimension of the index. + k (int, optional): Number of nearest neighbors to retrieve. + Defaults to 5. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing search results. + Each dictionary has the following keys: + - 'id': Document ID + - 'distance': L2 distance to query + - 'data': Raw time series data + - 'metadata': Document metadata + + Raises: + ValueError: If query_embedding has incorrect shape. + """ + if query_embedding.shape[-1] != self.embedding_dim: + raise ValueError( + f"Query embedding dimension mismatch. Expected {self.embedding_dim}, " + f"got {query_embedding.shape[-1]}" + ) + distances, indices = self.index.search( query_embedding.reshape(1, -1), k ) + results = [] for i, idx in enumerate(indices[0]): if idx < len(self.documents): @@ -37,7 +158,15 @@ def search(self, query_embedding: np.ndarray, k: int = 5): }) return results - def get_document_by_id(self, doc_id: str): + def get_document_by_id(self, doc_id: str) -> Optional[TimeSeriesDocument]: + """Retrieve a document by its ID. + + Args: + doc_id (str): The ID of the document to retrieve. + + Returns: + Optional[TimeSeriesDocument]: The document if found, None otherwise. 
+ """ for doc in self.documents: if doc.id == doc_id: return doc