From 7009d59c2850f46a99d61d1f09eea54bbaecc451 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Picault?=
Date: Mon, 27 Jan 2025 09:13:53 +0100
Subject: [PATCH] Updated train topics function for incremental topic learning

Updated notebook
---
 bertrend/BERTrend.py                      |    7 +-
 bertrend/trend_analysis/weak_signals.py   |   11 +-
 getting_started/bertrend_quickstart.ipynb | 2051 +++++++++++++++++++++
 pyproject.toml                            |    3 +-
 4 files changed, 2063 insertions(+), 9 deletions(-)
 create mode 100644 getting_started/bertrend_quickstart.ipynb

diff --git a/bertrend/BERTrend.py b/bertrend/BERTrend.py
index 8c9d376..c1e93da 100644
--- a/bertrend/BERTrend.py
+++ b/bertrend/BERTrend.py
@@ -259,12 +259,13 @@ def train_topic_models(
 
         self._is_fitted = True
 
+        # Merge the newly obtained topic models with the existing ones
         # Update topic_models: Dictionary of trained BERTopic models for each timestamp.
-        self.topic_models = topic_models
+        self.topic_models.update(topic_models)
         # Update doc_groups: Dictionary of document groups for each timestamp.
-        self.doc_groups = doc_groups
+        self.doc_groups.update(doc_groups)
         # Update emb_groups: Dictionary of document embeddings for each timestamp.
-        self.emb_groups = emb_groups
+        self.emb_groups.update(emb_groups)
         logger.success("Finished training all topic models")
 
     def merge_all_models(
diff --git a/bertrend/trend_analysis/weak_signals.py b/bertrend/trend_analysis/weak_signals.py
index ddb7255..0a278c4 100644
--- a/bertrend/trend_analysis/weak_signals.py
+++ b/bertrend/trend_analysis/weak_signals.py
@@ -8,6 +8,7 @@
 import scipy
 from bertopic import BERTopic
 from loguru import logger
+from pandas import Timestamp
 
 from bertrend.llm_utils.openai_client import OpenAI_Client
 from bertrend import LLM_CONFIG
@@ -15,24 +16,24 @@
 
 
 def detect_weak_signals_zeroshot(
-    topic_models: dict[pd.Timestamp, BERTopic],
+    topic_models: dict[Timestamp, BERTopic],
     zeroshot_topic_list: list[str],
     granularity: int,
     decay_factor: float = 0.01,
     decay_power: float = 2,
-) -> dict[str, dict[pd.Timestamp, dict[str, any]]]:
+) -> dict[str, dict[Timestamp, dict[str, any]]]:
     """
     Detect weak signals based on the zero-shot list of topics to monitor.
 
     Args:
-        topic_models (Dict[pd.Timestamp, BERTopic]): Dictionary of BERTopic models for each timestamp.
+        topic_models (Dict[Timestamp, BERTopic]): Dictionary of BERTopic models for each timestamp.
         zeroshot_topic_list (List[str]): List of topics to monitor for weak signals.
         granularity (int): The granularity of the timestamps in days.
         decay_factor (float): The decay factor for exponential decay.
         decay_power (float): The decay power for exponential decay.
 
     Returns:
-        Dict[str, Dict[pd.Timestamp, Dict[str, any]]]: Dictionary of weak signal trends for each monitored topic.
+        Dict[str, Dict[Timestamp, Dict[str, any]]]: Dictionary of weak signal trends for each monitored topic.
""" weak_signal_trends = {} @@ -329,7 +330,7 @@ def _apply_decay_to_inactive_topics( topic_last_popularity[topic] = decayed_popularity -def analyze_signal(bertrend, topic_number: int, current_date): +def analyze_signal(bertrend, topic_number: int, current_date: Timestamp): topic_merge_rows = bertrend.all_merge_histories_df[ bertrend.all_merge_histories_df["Topic1"] == topic_number ].sort_values("Timestamp") diff --git a/getting_started/bertrend_quickstart.ipynb b/getting_started/bertrend_quickstart.ipynb new file mode 100644 index 0000000..f552fac --- /dev/null +++ b/getting_started/bertrend_quickstart.ipynb @@ -0,0 +1,2051 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "\n", + "\n" + ], + "id": "73870050e69c50e6" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# BERTrend quickstart\n", + "The purpose of this notebook is to complement the existing demos available in the directory `bertrend/demos` with some code examples that explain how to integrate BERTrend with your application code." + ], + "id": "10a9d82c667c7fbe" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:00:24.370757Z", + "start_time": "2025-01-20T15:00:24.349873Z" + } + }, + "cell_type": "code", + "source": [ + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "id": "849734b0d71f2495", + "outputs": [], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## BERTrend installation", + "id": "a795490c2d3e539e" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-26T21:07:28.378082Z", + "start_time": "2025-01-26T21:07:28.370941Z" + } + }, + "cell_type": "code", + "source": [ + "import json\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "from pandas import Timestamp\n", + "from IPython.display import display\n", + "from loguru import logger\n", + "\n", + "from bertrend import DATA_PATH\n", + "from bertrend.BERTrend import BERTrend\n", + "from bertrend import MODELS_DIR\n", + "from bertrend.utils.data_loading import load_data, split_data, TEXT_COLUMN\n", + "from bertrend.services.embedding_service import EmbeddingService\n", + "from bertrend.BERTopicModel import BERTopicModel\n", + "from bertrend.topic_analysis.topic_description import generate_topic_description\n", + "from bertrend.trend_analysis.weak_signals import analyze_signal\n" + ], + "id": "ba4a7eacde91b892", + "outputs": [], + "execution_count": 54 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-26T21:07:30.328141Z", + "start_time": "2025-01-26T21:07:30.324568Z" + } + }, + "cell_type": "code", + "source": "#!pip install bertrend", + "id": "74702a2391f80f72", + "outputs": [], + "execution_count": 55 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Configuration of topic models", + "id": "ca03bdd5398b56b3" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:00:35.343828Z", + "start_time": "2025-01-20T15:00:35.298417Z" + } + }, + "cell_type": "code", + "source": [ + "# Topic model with default parameters - each parameter of BERTopic can be modified from the constructor or can be read from a configuration file\n", + "# overrides the default config to use English\n", + "config = '''\n", + "# Default configuration file to be used for topic model\n", + "\n", + "# Global parameters\n", + "[global]\n", + "language = \"English\"\n", + "\n", + "# BERTopic parameters: 
https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.__init__\n",
+    "[bertopic_model]\n",
+    "top_n_words = 10\n",
+    "verbose = true\n",
+    "representation_model = [\"MaximalMarginalRelevance\"] # KeyBERTInspired, OpenAI\n",
+    "zeroshot_topic_list = []\n",
+    "zeroshot_min_similarity = 0\n",
+    "\n",
+    "# UMAP parameters: https://umap-learn.readthedocs.io/en/latest/api.html\n",
+    "[umap_model]\n",
+    "n_neighbors = 5\n",
+    "n_components = 5\n",
+    "min_dist = 0.0\n",
+    "metric = \"cosine\"\n",
+    "random_state = 42\n",
+    "\n",
+    "# HDBSCAN parameters: https://hdbscan.readthedocs.io/en/latest/api.html\n",
+    "[hdbscan_model]\n",
+    "min_cluster_size = 5\n",
+    "min_samples = 5\n",
+    "metric = \"euclidean\"\n",
+    "cluster_selection_method = \"eom\"\n",
+    "prediction_data = true\n",
+    "\n",
+    "# CountVectorizer: https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\n",
+    "[vectorizer_model]\n",
+    "ngram_range = [1, 1]\n",
+    "stop_words = true # If true, will check `language` parameter and load associated stopwords file\n",
+    "min_df = 2\n",
+    "\n",
+    "# ClassTfidfTransformer: https://maartengr.github.io/BERTopic/api/ctfidf.html\n",
+    "[ctfidf_model]\n",
+    "bm25_weighting = false\n",
+    "reduce_frequent_words = true\n",
+    "\n",
+    "# MaximalMarginalRelevance: https://maartengr.github.io/BERTopic/api/representation/mmr.html\n",
+    "[mmr_model]\n",
+    "diversity = 0.3\n",
+    "\n",
+    "# Reduce outliers: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.reduce_outliers\n",
+    "[reduce_outliers]\n",
+    "strategy = \"c-tf-idf\"\n",
+    "'''\n",
+    "\n",
+    "topic_model = BERTopicModel(config)"
+   ],
+   "id": "b97d93ac81a4d420",
+   "outputs": [],
+   "execution_count": 6
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-20T15:00:35.547370Z",
+     "start_time": "2025-01-20T15:00:35.486087Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "# The BERTopicModel class is mainly a wrapper around BERTopic. It can be used as-is, for example for a first analysis of the data that does not take evolving trends into account (a usage sketch follows)",
+   "id": "fa92f4b55e7b7b72",
+   "outputs": [],
+   "execution_count": 8
+  },
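+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "For illustration only, the sketch below shows how the wrapper could be fitted directly on a whole dataset, without any time slicing. It is left commented out because `df` and `embeddings` are only defined later in this notebook, and because the parameter names of `BERTopicModel.fit` are an assumption to be checked against `bertrend/BERTopicModel.py`."
+   ],
+   "id": "1a2b3c4d5e6f7a81"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# Minimal sketch of a direct (non-temporal) topic analysis with the wrapper.\n",
+    "# Assumption: fit() accepts the documents and their precomputed embeddings;\n",
+    "# check the actual signature in bertrend/BERTopicModel.py before running.\n",
+    "# output = topic_model.fit(docs=df[TEXT_COLUMN].tolist(), embeddings=embeddings)\n",
+    "# output.topic_model.get_topic_info()  # standard BERTopic inspection on the fitted model (assumed attribute)"
+   ],
+   "id": "1a2b3c4d5e6f7a82",
+   "outputs": [],
+   "execution_count": null
+  },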
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Using BERTrend for retrospective analysis",
+   "id": "7cfd832467877a23"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### Instantiation of BERTrend\n",
+   "id": "6a07ec11284b82cb"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "In the case of a **retrospective trend analysis** task, the goal is to identify and evaluate patterns or changes over time within a dataset, allowing for insights into historical performance, behaviors, or events that can inform future decision-making and strategy development.\n",
+    "\n",
+    "In this context, the general principle consists of splitting the past data into different time slices. Each slice is then used to train a separate topic model. The topic model corresponding to an older data slice is merged with the next one, and decay factors are applied. This provides a view of how topics evolve over time."
+   ],
+   "id": "c5118dce73f8cfce"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-20T15:00:35.784959Z",
+     "start_time": "2025-01-20T15:00:35.745153Z"
+    }
+   },
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": 9,
+   "source": [
+    "# Basic creation of the object and parametrization\n",
+    "# BERTrend uses several topic models; therefore, it is necessary to pass a topic_model object as a reference\n",
+    "bertrend = BERTrend(topic_model=topic_model)"
+   ],
+   "id": "52bc66eed5bb040"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### 1. Gather historical data to be analyzed\n",
+   "id": "bf7cd6699bf77299"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-20T15:00:35.978219Z",
+     "start_time": "2025-01-20T15:00:35.813108Z"
+    }
+   },
+   "cell_type": "code",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                 ID         timestamp  \\\n",
+       "0  @realDonaldTrump  2017-01-20 06:31   \n",
+       "1  @realDonaldTrump  2017-01-20 11:51   \n",
+       "2  @realDonaldTrump  2017-01-20 11:51   \n",
+       "3  @realDonaldTrump  2017-01-20 11:52   \n",
+       "4  @realDonaldTrump  2017-01-20 11:53   \n",
+       "\n",
+       "                                                 url  \\\n",
+       "0  https://twitter.com/realDonaldTrump/status/822...   \n",
+       "1  https://twitter.com/realDonaldTrump/status/822...   \n",
+       "2  https://twitter.com/realDonaldTrump/status/822...   \n",
+       "3  https://twitter.com/realDonaldTrump/status/822...   \n",
+       "4  https://twitter.com/realDonaldTrump/status/822...   \n",
+       "\n",
+       "                                                text            source  \\\n",
+       "0  It all begins today! I will see you at 11:00 A...  @realDonaldTrump   \n",
+       "1  Today we are not merely transferring power fro...  @realDonaldTrump   \n",
+       "2  power from Washington, D.C. and giving it back...  @realDonaldTrump   \n",
+       "3  What truly matters is not which party controls...  @realDonaldTrump   \n",
+       "4  January 20th 2017, will be remembered as the d...  @realDonaldTrump   \n",
+       "\n",
+       "   document_id  \n",
+       "0            0  \n",
+       "1            1  \n",
+       "2            2  \n",
+       "3            3  \n",
+       "4            4  "
+      ]
+     },
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDtimestampurltextsourcedocument_id
0@realDonaldTrump2017-01-20 06:31https://twitter.com/realDonaldTrump/status/822...It all begins today! I will see you at 11:00 A...@realDonaldTrump0
1@realDonaldTrump2017-01-20 11:51https://twitter.com/realDonaldTrump/status/822...Today we are not merely transferring power fro...@realDonaldTrump1
2@realDonaldTrump2017-01-20 11:51https://twitter.com/realDonaldTrump/status/822...power from Washington, D.C. and giving it back...@realDonaldTrump2
3@realDonaldTrump2017-01-20 11:52https://twitter.com/realDonaldTrump/status/822...What truly matters is not which party controls...@realDonaldTrump3
4@realDonaldTrump2017-01-20 11:53https://twitter.com/realDonaldTrump/status/822...January 20th 2017, will be remembered as the d...@realDonaldTrump4
\n", + "
" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 10, + "source": [ + "# Here some Trump tweets from: https://github.com/MarkHershey/CompleteTrumpTweetsArchive/blob/master/data/realDonaldTrump_in_office.csv\n", + "#!wget \"https://raw.githubusercontent.com/MarkHershey/CompleteTrumpTweetsArchive/refs/heads/master/data/realDonaldTrump_in_office.csv\"\n", + "df = pd.read_csv(\"realDonaldTrump_in_office.csv\", sep=',',quotechar='\"', skipinitialspace=True)\n", + "# BERTrend expects specific data format\n", + "df = df.rename(columns={'Time': 'timestamp', 'Tweet URL': 'url', \"Tweet Text\": \"text\"})\n", + "df[\"source\"]=df[\"ID\"]\n", + "df[\"document_id\"] = df.index\n", + "df.reset_index(inplace=True, drop=True)\n", + "df.head(5)" + ], + "id": "154fb553f7004986" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:00:36.089939Z", + "start_time": "2025-01-20T15:00:36.031108Z" + } + }, + "cell_type": "code", + "source": "df.index", + "id": "d2e8b96b46718241", + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=23075, step=1)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 11 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### 2. Embed data", + "id": "9d26753d9496a25" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "# Selection of a subset of data\n", + "df = df.head(1000)\n", + "\n", + "embedding_service_cfg = {\"local\": False, \"host\":\"10.132.5.44\", \"port\": 6464}\n", + "\n", + "embedding_service = EmbeddingService(**embedding_service_cfg)\n", + "embeddings, token_strings, token_embeddings = embedding_service.embed(\n", + " texts=df[\"text\"],\n", + " )" + ], + "id": "1ca3e17198fdbb6a", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "d32f4a70dfe634ba" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "25ef45b3d3e34f4" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "a732b1c0303ce39e" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:01:16.783505Z", + "start_time": "2025-01-20T15:00:36.289891Z" + } + }, + "cell_type": "code", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:00:36.345\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.services.embedding_service\u001B[0m:\u001B[36m_remote_embed_documents\u001B[0m:\u001B[36m203\u001B[0m - \u001B[34m\u001B[1mComputing embeddings...\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:16.205\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.services.embedding_service\u001B[0m:\u001B[36m_remote_embed_documents\u001B[0m:\u001B[36m210\u001B[0m - \u001B[34m\u001B[1mComputing embeddings done for batch\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:16.779\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.services.embedding_service\u001B[0m:\u001B[36m_get_remote_model_name\u001B[0m:\u001B[36m226\u001B[0m - \u001B[34m\u001B[1mModel name: OrdalieTech/Solon-embeddings-large-0.1\u001B[0m\n" + ] + } + ], + "execution_count": 13, + "source": "", + "id": "7e02db73cd68797a" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:01:16.856529Z", + "start_time": 
"2025-01-20T15:01:16.812294Z" + } + }, + "cell_type": "code", + "source": "embedding_model_name = embedding_service.embedding_model_name\n", + "id": "72df96f5c7d8d52b", + "outputs": [], + "execution_count": 14 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### 3. Split the data into time slices\n", + "\n", + "This can be done manually for some reason or can be done automatically based on a specified time granularity" + ], + "id": "2e94b24d1ef107a2" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:01:16.964906Z", + "start_time": "2025-01-20T15:01:16.921763Z" + } + }, + "cell_type": "code", + "source": [ + "from bertrend.utils.data_loading import group_by_days, load_data\n", + "\n", + "day_granularity = 30\n", + "grouped_data = group_by_days(df=df, day_granularity=day_granularity)" + ], + "id": "9ea313bff64c8cce", + "outputs": [], + "execution_count": 16 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:01:17.040491Z", + "start_time": "2025-01-20T15:01:16.997388Z" + } + }, + "cell_type": "code", + "source": [ + "# Number of sliced data\n", + "len(grouped_data)" + ], + "id": "a89b3c810c4575bc", + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 17 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### 4. Train topic models", + "id": "9d7ffa03a6ed9330" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:11.584568Z", + "start_time": "2025-01-20T15:01:17.180822Z" + } + }, + "cell_type": "code", + "source": "bertrend.train_topic_models(grouped_data=grouped_data, embedding_model=embedding_model_name, embeddings=embeddings)", + "id": "8e11789ecb115639", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:01:17.216\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 1/6...\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:17.217\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-01-20 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:17.218\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 184\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:17.218\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:17.219\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:17.221\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:01:17.221\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting BERTopic 
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### 4. Train topic models",
+   "id": "9d7ffa03a6ed9330"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-20T15:07:11.584568Z",
+     "start_time": "2025-01-20T15:01:17.180822Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "bertrend.train_topic_models(grouped_data=grouped_data, embedding_model=embedding_model_name, embeddings=embeddings)",
+   "id": "8e11789ecb115639",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001B[32m2025-01-20 16:01:17.216\u001B[0m | \u001B[1mINFO    \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 1/6...\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:17.217\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-01-20 00:00:00\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:17.218\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 184\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:17.218\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:17.219\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:17.221\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:17.221\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting BERTopic model\u001B[0m\n",
+      "2025-01-20 16:01:19,876 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n",
+      "2025-01-20 16:01:25,485 - BERTopic - Dimensionality - Completed ✓\n",
+      "2025-01-20 16:01:25,486 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
+      "2025-01-20 16:01:25,494 - BERTopic - Cluster - Completed ✓\n",
+      "2025-01-20 16:01:25,497 - BERTopic - Representation - Extracting topics from clusters using representation models.\n",
+      "2025-01-20 16:01:44,786 - BERTopic - Representation - Completed ✓\n",
+      "\u001B[32m2025-01-20 16:01:44.829\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m231\u001B[0m - \u001B[34m\u001B[1m\tReducing outliers\u001B[0m\n",
+      "2025-01-20 16:01:44,832 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weighted c-TF-IDF embeddings instead of centroid embeddings.\n",
+      "\u001B[32m2025-01-20 16:01:58.869\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m259\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model fitted successfully\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.871\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m158\u001B[0m - \u001B[34m\u001B[1mTopic model created successfully\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.911\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m244\u001B[0m - \u001B[34m\u001B[1mSuccessfully processed period: 2017-01-20 00:00:00\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.912\u001B[0m | \u001B[1mINFO    \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 2/6...\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.914\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-02-19 00:00:00\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.915\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 123\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.921\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.922\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.924\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n",
+      "\u001B[32m2025-01-20 16:01:58.925\u001B[0m | \u001B[34m\u001B[1mDEBUG   \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting 
BERTopic model\u001B[0m\n", + "2025-01-20 16:02:00,684 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-01-20 16:02:00,878 - BERTopic - Dimensionality - Completed ✓\n", + "2025-01-20 16:02:00,879 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-01-20 16:02:00,886 - BERTopic - Cluster - Completed ✓\n", + "2025-01-20 16:02:00,889 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", + "2025-01-20 16:02:25,983 - BERTopic - Representation - Completed ✓\n", + "\u001B[32m2025-01-20 16:02:26.023\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m231\u001B[0m - \u001B[34m\u001B[1m\tReducing outliers\u001B[0m\n", + "2025-01-20 16:02:26,029 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.\n", + "\u001B[32m2025-01-20 16:02:43.040\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m259\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model fitted successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.043\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m158\u001B[0m - \u001B[34m\u001B[1mTopic model created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.101\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m244\u001B[0m - \u001B[34m\u001B[1mSuccessfully processed period: 2017-02-19 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.102\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 3/6...\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.103\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-03-21 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.104\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 132\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.107\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.111\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.115\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:02:43.119\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - 
\u001B[34m\u001B[1m\tFitting BERTopic model\u001B[0m\n", + "2025-01-20 16:02:45,433 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-01-20 16:02:45,619 - BERTopic - Dimensionality - Completed ✓\n", + "2025-01-20 16:02:45,620 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-01-20 16:02:45,628 - BERTopic - Cluster - Completed ✓\n", + "2025-01-20 16:02:45,631 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", + "2025-01-20 16:03:14,636 - BERTopic - Representation - Completed ✓\n", + "\u001B[32m2025-01-20 16:03:14.732\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m231\u001B[0m - \u001B[34m\u001B[1m\tReducing outliers\u001B[0m\n", + "2025-01-20 16:03:14,738 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.\n", + "\u001B[32m2025-01-20 16:03:36.833\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m259\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model fitted successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.835\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m158\u001B[0m - \u001B[34m\u001B[1mTopic model created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.871\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m244\u001B[0m - \u001B[34m\u001B[1mSuccessfully processed period: 2017-03-21 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.874\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 4/6...\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.879\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-04-20 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.882\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 168\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.884\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.885\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.887\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:03:36.910\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | 
\u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting BERTopic model\u001B[0m\n", + "2025-01-20 16:03:39,750 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-01-20 16:03:40,226 - BERTopic - Dimensionality - Completed ✓\n", + "2025-01-20 16:03:40,227 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-01-20 16:03:40,253 - BERTopic - Cluster - Completed ✓\n", + "2025-01-20 16:03:40,261 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", + "2025-01-20 16:04:04,727 - BERTopic - Representation - Completed ✓\n", + "\u001B[32m2025-01-20 16:04:04.818\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m231\u001B[0m - \u001B[34m\u001B[1m\tReducing outliers\u001B[0m\n", + "2025-01-20 16:04:04,823 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.\n", + "\u001B[32m2025-01-20 16:04:18.895\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m259\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model fitted successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.898\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m158\u001B[0m - \u001B[34m\u001B[1mTopic model created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.951\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m244\u001B[0m - \u001B[34m\u001B[1mSuccessfully processed period: 2017-04-20 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.954\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 5/6...\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.956\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-05-20 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.959\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 161\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.960\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.961\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.965\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:04:18.969\u001B[0m | \u001B[34m\u001B[1mDEBUG 
\u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting BERTopic model\u001B[0m\n", + "2025-01-20 16:04:21,148 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-01-20 16:04:21,368 - BERTopic - Dimensionality - Completed ✓\n", + "2025-01-20 16:04:21,368 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-01-20 16:04:21,377 - BERTopic - Cluster - Completed ✓\n", + "2025-01-20 16:04:21,381 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", + "2025-01-20 16:04:55,543 - BERTopic - Representation - Completed ✓\n", + "\u001B[32m2025-01-20 16:04:55.638\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m231\u001B[0m - \u001B[34m\u001B[1m\tReducing outliers\u001B[0m\n", + "2025-01-20 16:04:55,642 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.\n", + "\u001B[32m2025-01-20 16:05:25.159\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m259\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model fitted successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.161\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m158\u001B[0m - \u001B[34m\u001B[1mTopic model created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.192\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m244\u001B[0m - \u001B[34m\u001B[1mSuccessfully processed period: 2017-05-20 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.194\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 6/6...\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.194\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2017-06-19 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.195\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 232\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.196\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.196\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.197\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:05:25.198\u001B[0m | 
\u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting BERTopic model\u001B[0m\n", + "2025-01-20 16:05:26,934 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-01-20 16:05:27,265 - BERTopic - Dimensionality - Completed ✓\n", + "2025-01-20 16:05:27,265 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-01-20 16:05:27,276 - BERTopic - Cluster - Completed ✓\n", + "2025-01-20 16:05:27,278 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", + "2025-01-20 16:06:25,302 - BERTopic - Representation - Completed ✓\n", + "\u001B[32m2025-01-20 16:06:25.384\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m231\u001B[0m - \u001B[34m\u001B[1m\tReducing outliers\u001B[0m\n", + "2025-01-20 16:06:25,388 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.\n", + "\u001B[32m2025-01-20 16:07:11.512\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m259\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model fitted successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:07:11.516\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m158\u001B[0m - \u001B[34m\u001B[1mTopic model created successfully\u001B[0m\n", + "\u001B[32m2025-01-20 16:07:11.563\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m244\u001B[0m - \u001B[34m\u001B[1mSuccessfully processed period: 2017-06-19 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-20 16:07:11.566\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m268\u001B[0m - \u001B[32m\u001B[1mFinished training all topic models\u001B[0m\n" + ] + } + ], + "execution_count": 19 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### 5. (Optional) Save trained_models", + "id": "855c151c8cd9f93d" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:12.523789Z", + "start_time": "2025-01-20T15:07:12.377692Z" + } + }, + "cell_type": "code", + "source": "bertrend.save_models()", + "id": "2a54146c6b5f591b", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:07:12.514\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36msave_models\u001B[0m:\u001B[36m652\u001B[0m - \u001B[1mModels saved to: /home/jerome/dev/cache/bertrend/models\u001B[0m\n" + ] + } + ], + "execution_count": 21 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### 6. 
Merge models", + "id": "6d76285c9be44e92" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:13.179985Z", + "start_time": "2025-01-20T15:07:12.853779Z" + } + }, + "cell_type": "code", + "source": "bertrend.merge_all_models()", + "id": "a95fd062728118e9", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:07:13.172\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mmerge_all_models\u001B[0m:\u001B[36m351\u001B[0m - \u001B[32m\u001B[1mAll models merged successfully\u001B[0m\n" + ] + } + ], + "execution_count": 23 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### 7. Calculate signal popularity", + "id": "d5cbf21f65102cd5" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:13.819430Z", + "start_time": "2025-01-20T15:07:13.579473Z" + } + }, + "cell_type": "code", + "source": "bertrend.calculate_signal_popularity()", + "id": "94859eb8b9944224", + "outputs": [], + "execution_count": 25 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:13.939621Z", + "start_time": "2025-01-20T15:07:13.854683Z" + } + }, + "cell_type": "code", + "source": [ + "# List of topic models\n", + "bertrend.topic_models" + ], + "id": "7a989f7d97083e70", + "outputs": [ + { + "data": { + "text/plain": [ + "{Timestamp('2017-01-20 00:00:00'): ,\n", + " Timestamp('2017-02-19 00:00:00'): ,\n", + " Timestamp('2017-03-21 00:00:00'): ,\n", + " Timestamp('2017-04-20 00:00:00'): ,\n", + " Timestamp('2017-05-20 00:00:00'): ,\n", + " Timestamp('2017-06-19 00:00:00'): }" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 26 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:14.331855Z", + "start_time": "2025-01-20T15:07:14.116053Z" + } + }, + "cell_type": "code", + "source": [ + "window_size = 30\n", + "\n", + "# List of strong and weak signals over time\n", + "for ts in bertrend.topic_models.keys():\n", + " print(ts)\n", + " noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, ts)\n", + " if not weak_signal_topics_df.empty:\n", + " print(\"Weak signals\")\n", + " display(weak_signal_topics_df[[\"Topic\",\"Representation\"]].head(5))\n", + " if not strong_signal_topics_df.empty:\n", + " print(\"Strong signals\")\n", + " display(strong_signal_topics_df[[\"Topic\",\"Representation\"]].head(5))\n", + " print()\n" + ], + "id": "dcba20eeaef6b472", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-01-20 00:00:00\n", + "Strong signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 0 healthcare_getting_together_disaster_new_despi...\n", + "1 1 https_great_at_meeting_amp_american_trump_we_f..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
00healthcare_getting_together_disaster_new_despi...
11https_great_at_meeting_amp_american_trump_we_f...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2017-02-19 00:00:00\n", + "Weak signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 0 win_republicans_immigration_illegal_dems_until...\n", + "1 1 https_our_today_jobs_american_great_at_preside..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
00win_republicans_immigration_illegal_dems_until...
11https_our_today_jobs_american_great_at_preside...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2017-03-21 00:00:00\n", + "Weak signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 4 night_interviewed_saturday_foxnews_next_tax_me...\n", + "1 10 healthcare_obamacare_plan_dead_lie_great_compa..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
04night_interviewed_saturday_foxnews_next_tax_me...
110healthcare_obamacare_plan_dead_lie_great_compa...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strong signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 0 fake_news_said_possible_amp_yates_while_china_...\n", + "1 1 https_today_order_at_presidential_foxandfriend...\n", + "2 2 democrats_our_wall_insurance_companies_governm..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
00fake_news_said_possible_amp_yates_while_china_...
11https_today_order_at_presidential_foxandfriend...
22democrats_our_wall_insurance_companies_governm...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2017-04-20 00:00:00\n", + "Weak signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 2 help_country_our_justice_must_before_peace_his...\n", + "1 4 g7_jobs_terrorism_italy_trip_melania_security_...\n", + "2 8 nato_hard_east_saudi_trying_countries_2016_sho...\n", + "3 9 healthcare_cuts_obamacare_montana_republican_w..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
02help_country_our_justice_must_before_peace_his...
14g7_jobs_terrorism_italy_trip_melania_security_...
28nato_hard_east_saudi_trying_countries_2016_sho...
39healthcare_cuts_obamacare_montana_republican_w...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strong signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 0 news_media_dems_they_now_no_london_phony_faken...\n", + "1 1 deal_workers_trump_again_promise_realdonaldtru..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
00news_media_dems_they_now_no_london_phony_faken...
11deal_workers_trump_again_promise_realdonaldtru...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2017-05-20 00:00:00\n", + "Weak signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 12 gop_georgia_foxnews_steel_congressional_foxand..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
012gop_georgia_foxnews_steel_congressional_foxand...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strong signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 0 fbi_cia_asked_disgraceful_hoax_refused_seat_ta...\n", + "1 1 realdonaldtrump_potus_rt_weekly_friends_trump_...\n", + "2 3 obama_meddling_election_nothing_2016_russian_w...\n", + "3 7 south_deals_uswomensopen_women_meetings_moon_m...\n", + "4 9 democrats_healthcare_would_dems_senate_failed_..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
00fbi_cia_asked_disgraceful_hoax_refused_seat_ta...
11realdonaldtrump_potus_rt_weekly_friends_trump_...
23obama_meddling_election_nothing_2016_russian_w...
37south_deals_uswomensopen_women_meetings_moon_m...
49democrats_healthcare_would_dems_senate_failed_...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2017-06-19 00:00:00\n", + "Weak signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 3 obama_meddling_election_nothing_2016_russian_w...\n", + "1 7 south_deals_uswomensopen_women_meetings_moon_m...\n", + "2 9 democrats_healthcare_would_dems_senate_failed_...\n", + "3 12 gop_georgia_foxnews_steel_congressional_foxand...\n", + "4 13 market_jobs_another_deal_syria_like_border_ste..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
03obama_meddling_election_nothing_2016_russian_w...
17south_deals_uswomensopen_women_meetings_moon_m...
29democrats_healthcare_would_dems_senate_failed_...
312gop_georgia_foxnews_steel_congressional_foxand...
413market_jobs_another_deal_syria_like_border_ste...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strong signals\n" + ] + }, + { + "data": { + "text/plain": [ + " Topic Representation\n", + "0 0 fbi_cia_asked_disgraceful_hoax_refused_seat_ta...\n", + "1 1 realdonaldtrump_potus_rt_weekly_friends_trump_..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicRepresentation
00fbi_cia_asked_disgraceful_hoax_refused_seat_ta...
11realdonaldtrump_potus_rt_weekly_friends_trump_...
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "execution_count": 27 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-26T19:58:55.911033Z", + "start_time": "2025-01-26T19:58:55.907556Z" + } + }, + "cell_type": "code", + "source": [ + "# selection of one particular timestamp to look at\n", + "selected_timestamp = Timestamp('2017-04-20 00:00:00')\n", + "selected_topic_model = bertrend.topic_models.get(selected_timestamp)\n" + ], + "id": "4582c0cb6c1f6186", + "outputs": [], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Get topic description\n", + "id": "e31285ee5eb9d9f6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:09:22.901513Z", + "start_time": "2025-01-20T15:09:22.731495Z" + } + }, + "cell_type": "code", + "source": "desc = generate_topic_description(topic_model=selected_topic_model, topic_number=5, filtered_docs=df, language_code=\"en\")\n", + "id": "c945b625df18d881", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:09:22.895\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mbertrend.topic_analysis.topic_description\u001B[0m:\u001B[36mgenerate_topic_description\u001B[0m:\u001B[36m51\u001B[0m - \u001B[31m\u001B[1mError calling OpenAI API: ' \"title\"'\u001B[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'Error generating description: \\' \"title\"\\''" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 38 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:23:13.250764Z", + "start_time": "2025-01-20T15:23:11.647929Z" + } + }, + "cell_type": "code", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:23:13.247\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.llm_utils.openai_client\u001B[0m:\u001B[36mgenerate_from_history\u001B[0m:\u001B[36m128\u001B[0m - \u001B[34m\u001B[1mAPI returned: ChatCompletion(id='chatcmpl-ArnuKCesKptMpkREbYsHA1tBs6qI2', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\\n \"title\": \"Inauguration Night: Power Shift and Future Meetings\",\\n \"description\": \"Ce thème explore la dynamique de la nuit d\\'inauguration, marquée par des interviews et des discussions sur les prochaines étapes du gouvernement. Les événements de samedi, notamment sur Fox News, mettent en lumière les enjeux fiscaux et les réunions à venir avec des représentants étrangers. L\\'accent est mis sur le retour du pouvoir aux citoyens américains, soulignant l\\'importance des visites à domicile et des interactions directes. 
Ce moment symbolique représente un tournant dans la politique américaine, où les attentes et les promesses de changement sont au cœur des préoccupations.\"\\n}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), content_filter_results={'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}})], created=1737386592, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_5154047bf2', usage=CompletionUsage(completion_tokens=136, prompt_tokens=240, total_tokens=376, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}])\u001B[0m\n" + ] + } + ], + "execution_count": 68, + "source": "desc[\"title\"]", + "id": "e61b903379a0fbd1" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "desc[\"description\"]", + "id": "c4dbdd4998e0956a" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Get topic analysis", + "id": "e27e46b0adc6e88b" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:17.430211Z", + "start_time": "2025-01-20T15:07:16.745674Z" + } + }, + "cell_type": "code", + "source": "summary, analysis, formatted_html = analyze_signal(bertrend, 7, selected_timestamp)", + "id": "cdc44ef6f558aac0", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001B[32m2025-01-20 16:07:16.814\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mbertrend.trend_analysis.weak_signals\u001B[0m:\u001B[36manalyze_signal\u001B[0m:\u001B[36m416\u001B[0m - \u001B[31m\u001B[1mNo data available for topic 7 within the specified date range. Please enter a valid topic number.\u001B[0m\n" + ] + }, + { + "ename": "Exception", + "evalue": "No data available for topic 7 within the specified date range. 
Please enter a valid topic number.", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mException\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[33], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m summary, analysis, formatted_html \u001B[38;5;241m=\u001B[39m \u001B[43manalyze_signal\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 2\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;241;43m7\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[1;32m 3\u001B[0m \u001B[43m \u001B[49m\u001B[43mselected_timestamp\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[43mbertrend\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mall_merge_histories_df\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mbertrend\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mconfig\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mgranularity\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mEnglish\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[1;32m 7\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/dev/BERTrend/bertrend/trend_analysis/weak_signals.py:417\u001B[0m, in \u001B[0;36manalyze_signal\u001B[0;34m(topic_number, current_date, all_merge_histories_df, granularity, language)\u001B[0m\n\u001B[1;32m 415\u001B[0m error_msg \u001B[38;5;241m=\u001B[39m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNo data available for topic \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtopic_number\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m within the specified date range. Please enter a valid topic number.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 416\u001B[0m logger\u001B[38;5;241m.\u001B[39merror(error_msg)\n\u001B[0;32m--> 417\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(error_msg)\n", + "\u001B[0;31mException\u001B[0m: No data available for topic 7 within the specified date range. Please enter a valid topic number." + ] + } + ], + "execution_count": 33 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-20T15:07:17.433647087Z", + "start_time": "2025-01-19T14:38:52.904786Z" + } + }, + "cell_type": "code", + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(formatted_html))" + ], + "id": "531558c5b600cb30", + "outputs": [ + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "```html\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " Tensions and Diplomatic Engagements with North Korea Analysis Dashboard\n", + " \n", + "\n", + "\n", + "
<div>\n",
+       "<h2>Topic Evolution</h2>\n",
+       "<h3>Date: May 20, 2017: Tensions and Diplomatic Engagements with North Korea</h3>\n",
+       "<ul>\n",
+       "<li>John McCain and Lindsey Graham criticized for being weak on immigration in a joint statement.</li>\n",
+       "<li>President Trump hosted Japanese Prime Minister Shinzo Abe at Mar-a-Lago, emphasizing strong bilateral relations.</li>\n",
+       "<li>A joint statement regarding North Korea was released following productive talks with Prime Minister Abe.</li>\n",
+       "</ul>\n",
+       "<p><strong>What's New:</strong> This period introduces a more aggressive U.S. approach to North Korea, contrasting with the previous focus on diplomatic engagement. Additionally, the emphasis on trade negotiations signals a growing concern over economic relationships and their impact on national security.</p>\n",
+       "<h3>Date: May 22, 2017: Strategic Shifts in U.S. Foreign Policy and Trade</h3>\n",
+       "<ul>\n",
+       "<li>President Trump declared the end of strategic patience with North Korea, signaling a shift in U.S. policy.</li>\n",
+       "<li>A productive meeting with South Korean President Moon Jae-in focused on North Korea and trade deals.</li>\n",
+       "<li>The G20 Summit discussions included trade negotiations and international relations, particularly with China.</li>\n",
+       "</ul>\n",
+       "<p><strong>What's New:</strong> The transition from strategic patience to a more assertive stance on North Korea marks a significant shift in U.S. foreign policy, potentially leading to increased tensions in the region.</p>\n",
+       "<h2>Evolution Scenarios</h2>\n",
+       "<h3>Optimistic Scenario</h3>\n",
+       "<p>The U.S. could leverage its strengthened alliances to engage North Korea in meaningful negotiations, leading to denuclearization talks and a reduction in military tensions.</p>\n",
+       "<ul>\n",
+       "<li>Successful diplomacy could lead to a more stable Asia-Pacific region.</li>\n",
+       "<li>Economic cooperation may enhance regional stability.</li>\n",
+       "<li>Strengthened alliances could deter North Korean provocations.</li>\n",
+       "<li>Improved economic conditions for all parties involved.</li>\n",
+       "</ul>\n",
+       "<h3>Pessimistic Scenario</h3>\n",
+       "<p>The aggressive stance could lead to miscalculations and an unintended military confrontation, resulting in significant loss of life and destabilization of the region.</p>\n",
+       "<ul>\n",
+       "<li>Escalation to conflict could destabilize the Asia-Pacific region.</li>\n",
+       "<li>Increased isolationism may diminish U.S. global influence.</li>\n",
+       "<li>Failure to manage relations could lead to a vacuum filled by adversarial powers.</li>\n",
+       "<li>Humanitarian crises may arise from prolonged sanctions.</li>\n",
+       "</ul>\n",
+       "<h2>Topic Analysis</h2>\n",
+       "<h3>Short-term Implications</h3>\n",
+       "<ul>\n",
+       "<li>Geopolitical tensions could lead to immediate escalations in military posturing.</li>\n",
+       "<li>Economic repercussions may disrupt international trade, particularly with China.</li>\n",
+       "<li>Domestic political landscape may polarize further around immigration and foreign policy.</li>\n",
+       "<li>Increased military exercises in the region could provoke North Korea.</li>\n",
+       "</ul>\n",
+       "<h3>Long-term Implications</h3>\n",
+       "<ul>\n",
+       "<li>Strengthening ties with Japan and South Korea could lead to a cohesive security framework.</li>\n",
+       "<li>Normalization of military engagement may prioritize military readiness over diplomacy.</li>\n",
+       "<li>Economic isolation of North Korea could lead to long-term instability.</li>\n",
+       "<li>Potential regime change or humanitarian crises in North Korea.</li>\n",
+       "</ul>\n",
+       "<h3>Ripple Effects</h3>\n",
+       "<ul>\n",
+       "<li>Increased tensions may prompt neighboring countries to enhance military capabilities.</li>\n",
+       "<li>Humanitarian concerns could exacerbate issues in North Korea, leading to refugee flows.</li>\n",
+       "<li>Global trade dynamics may shift due to U.S. trade policy changes.</li>\n",
+       "<li>Potential realignments in global trade partnerships affecting economies worldwide.</li>\n",
+       "</ul>\n",
+       "<h3>Interconnections</h3>\n",
+       "<ul>\n",
+       "<li>Emerging technologies may drive investments in defense technologies.</li>\n",
+       "<li>Global supply chain resilience may lead to innovations in logistics.</li>\n",
+       "<li>Human rights advocacy could galvanize international organizations.</li>\n",
+       "<li>Increased awareness of humanitarian issues may influence U.S. policy changes.</li>\n",
+       "</ul>\n",
+       "<h3>Drivers</h3>\n",
+       "<ul>\n",
+       "<li>A strong political mandate could accelerate aggressive policies.</li>\n",
+       "<li>Public sentiment may bolster support for military readiness.</li>\n",
+       "<li>International support from allies could amplify U.S. efforts.</li>\n",
+       "<li>Growing concerns over national security could drive policy changes.</li>\n",
+       "</ul>\n",
+       "<h3>Inhibitors</h3>\n",
+       "<ul>\n",
+       "<li>Diplomatic resistance from countries like China and Russia may complicate efforts.</li>\n",
+       "<li>Economic consequences from trade disruptions could lead to political pressure.</li>\n",
+       "<li>Humanitarian concerns may lead to calls for more diplomatic engagement.</li>\n",
+       "<li>Backlash from U.S. businesses over trade policies could hinder aggressive actions.</li>\n",
+       "</ul>\n",
+       "</div>\n",
+       "\n",
+       "```"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "execution_count": 39
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "d4c54df2e25f24c9"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Using BERTrend for prospective analysis",
+   "id": "c922549ec07859a9"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "In a **prospective trend analysis task**, the goal is to **forecast future** developments or outcomes based on current data and trends, enabling organizations to make informed decisions, allocate resources effectively, and strategize for upcoming challenges or opportunities.\n",
+   "id": "cd3a1210eb53e1e2"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "In this example, we simulate a prospective task:\n",
+    "- new data arrives in successive batches\n",
+    "- for each batch, we train a new topic model, merge it with the previous ones, and detect strong and weak signals at each iteration (see the commented `process_new_data` function below)\n"
+   ],
+   "id": "100f841b083ce637"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-26T20:42:31.675644Z",
+     "start_time": "2025-01-26T20:42:31.671870Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "MY_DATA_DIR = DATA_PATH / \"feeds/feed_sobriete\"\n",
+    "\n",
+    "input_data = [\n",
+    "    MY_DATA_DIR / \"2024-12-30_feed_sobriete.jsonl\",\n",
+    "    MY_DATA_DIR / \"2025-01-06_feed_sobriete.jsonl\",\n",
+    "    MY_DATA_DIR / \"2025-01-20_feed_sobriete.jsonl\",\n",
+    "]\n",
+    "\n",
+    "window_size = 7"
+   ],
+   "id": "4d88b099fc25b600",
+   "outputs": [],
+   "execution_count": 18
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-26T21:04:04.405304Z",
+     "start_time": "2025-01-26T21:04:04.401150Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "embedding_service_cfg = {\"local\": False, \"host\": \"10.132.5.44\", \"port\": 6464}\n",
+    "\n",
+    "embedding_service = EmbeddingService(**embedding_service_cfg)\n",
+    "embedding_model_name = embedding_service.embedding_model_name"
+   ],
+   "id": "a4619e8b7e9fbf91",
+   "outputs": [],
+   "execution_count": 48
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-26T20:41:23.289362Z",
+     "start_time": "2025-01-26T20:41:23.284555Z"
+    }
+   },
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": 13,
+   "source": "BERTREND_MODELS_PATH = MODELS_DIR / \"sobriete_models\"",
+   "id": "29f00b403ea81df1"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-26T21:08:01.471923Z",
+     "start_time": "2025-01-26T21:08:01.464866Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def process_new_data(data_slice_path: Path, timestamp: pd.Timestamp):\n",
+    "    logger.debug(f\"Processing new data: {data_slice_path}\")\n",
+    "\n",
+    "    # Restore the models saved by previous iterations, otherwise start from scratch\n",
+    "    try:\n",
+    "        bertrend = BERTrend.restore_models(BERTREND_MODELS_PATH)\n",
+    "    except Exception:\n",
+    "        logger.warning(\"Cannot restore previous models, creating new one\")\n",
+    "        bertrend = BERTrend(topic_model=BERTopicModel())\n",
+    "\n",
+    "    # Read data\n",
+    "    df = load_data(data_slice_path, language=\"French\")\n",
+    "    df = split_data(df)\n",
+    "    text = df[TEXT_COLUMN]\n",
+    "\n",
+    "    # Embed new data\n",
+    "    embeddings, token_strings, token_embeddings = embedding_service.embed(\n",
+    "        texts=text,\n",
+    "    )\n",
+    "\n",
+    "    # Create topic model for new data\n",
+    "    bertrend.train_topic_models({timestamp: df}, embeddings=embeddings, embedding_model=embedding_model_name)\n",
+    "\n",
+    "    # Merge models\n",
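+    "    # Note (descriptive comment, inferred from the workflow above): merging\n",
+    "    # matches the topics of the newly trained model with those from previous\n",
+    "    # batches, so the same signal can be followed across successive updates\n",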
+    "    bertrend.merge_all_models()\n",
+    "\n",
+    "    # Compute popularities\n",
+    "    bertrend.calculate_signal_popularity()\n",
+    "\n",
+    "    # Classify the signals of the latest window into noise / weak / strong\n",
+    "    noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, timestamp)\n",
+    "    # TODO: save dfs\n",
+    "    wt = weak_signal_topics_df['Topic']\n",
+    "    logger.info(f\"Weak topics: {wt}\")\n",
+    "    for topic in wt:\n",
+    "        desc = generate_topic_description(topic_model=bertrend.topic_models[timestamp], topic_number=topic, filtered_docs=df, language_code=\"fr\")\n",
+    "        logger.info(f\"Topic: {topic}\\t\\t{desc['title']}\\n{desc['description']}\")\n",
+    "\n",
+    "\n",
+    "    # Save models\n",
+    "    bertrend.save_models(models_path=BERTREND_MODELS_PATH)\n",
+    "\n"
+   ],
+   "id": "63e3d13a7d8c0cb",
+   "outputs": [],
+   "execution_count": 56
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-26T21:05:36.124752Z",
+     "start_time": "2025-01-26T21:05:36.122652Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "",
+   "id": "1b36e0e226103b8c",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-01-26T21:12:53.800721Z",
+     "start_time": "2025-01-26T21:08:10.434372Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "input_data = [\n",
+    "    MY_DATA_DIR / \"2024-12-30_feed_sobriete.jsonl\",\n",
+    "]\n",
+    "for data_file in input_data:\n",
+    "    timestamp = pd.Timestamp(data_file.name.split('_')[0])\n",
+    "    process_new_data(data_file, timestamp)"
+   ],
+   "id": "a2695805f56be632",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001B[32m2025-01-26 22:08:10.435\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36m__main__\u001B[0m:\u001B[36mprocess_new_data\u001B[0m:\u001B[36m2\u001B[0m - \u001B[34m\u001B[1mProcessing new data: /home/jerome/dev/data/bertrend/feeds/feed_sobriete/2024-12-30_feed_sobriete.jsonl\u001B[0m\n",
+      "\u001B[32m2025-01-26 22:08:10.439\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36m__main__\u001B[0m:\u001B[36mprocess_new_data\u001B[0m:\u001B[36m8\u001B[0m - \u001B[33m\u001B[1mCannot restore previous models, creating new one\u001B[0m\n",
+      "\u001B[32m2025-01-26 22:08:10.650\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.services.embedding_service\u001B[0m:\u001B[36m_remote_embed_documents\u001B[0m:\u001B[36m203\u001B[0m - \u001B[34m\u001B[1mComputing embeddings...\u001B[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['title', 'summary', 'link', 'url', 'text', 'timestamp', 'document_id',\n",
+      "       'source'],\n",
+      "      dtype='object')\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001B[32m2025-01-26 22:08:40.748\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.services.embedding_service\u001B[0m:\u001B[36m_remote_embed_documents\u001B[0m:\u001B[36m210\u001B[0m - \u001B[34m\u001B[1mComputing embeddings done for batch\u001B[0m\n",
+      "\u001B[32m2025-01-26 22:08:40.800\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.services.embedding_service\u001B[0m:\u001B[36m_get_remote_model_name\u001B[0m:\u001B[36m226\u001B[0m - \u001B[34m\u001B[1mModel name: OrdalieTech/Solon-embeddings-large-0.1\u001B[0m\n",
+      "\u001B[32m2025-01-26 22:08:40.804\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36mtrain_topic_models\u001B[0m:\u001B[36m240\u001B[0m - \u001B[1mTraining topic model 1/1...\u001B[0m\n",
+      "\u001B[32m2025-01-26 22:08:40.810\u001B[0m | 
\u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m148\u001B[0m - \u001B[34m\u001B[1mProcessing period: 2024-12-30 00:00:00\u001B[0m\n", + "\u001B[32m2025-01-26 22:08:40.811\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m149\u001B[0m - \u001B[34m\u001B[1mNumber of documents: 932\u001B[0m\n", + "\u001B[32m2025-01-26 22:08:40.812\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTrend\u001B[0m:\u001B[36m_train_by_period\u001B[0m:\u001B[36m151\u001B[0m - \u001B[34m\u001B[1mCreating topic model...\u001B[0m\n", + "\u001B[32m2025-01-26 22:08:40.812\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m212\u001B[0m - \u001B[34m\u001B[1m\tInitializing BERTopic model\u001B[0m\n", + "\u001B[32m2025-01-26 22:08:40.814\u001B[0m | \u001B[32m\u001B[1mSUCCESS \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m222\u001B[0m - \u001B[32m\u001B[1m\tBERTopic model instance created successfully\u001B[0m\n", + "\u001B[32m2025-01-26 22:08:40.814\u001B[0m | \u001B[34m\u001B[1mDEBUG \u001B[0m | \u001B[36mbertrend.BERTopicModel\u001B[0m:\u001B[36mfit\u001B[0m:\u001B[36m224\u001B[0m - \u001B[34m\u001B[1m\tFitting BERTopic model\u001B[0m\n", + "2025-01-26 22:08:43,626 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-01-26 22:08:51,061 - BERTopic - Dimensionality - Completed ✓\n", + "2025-01-26 22:08:51,062 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-01-26 22:08:51,118 - BERTopic - Cluster - Completed ✓\n", + "2025-01-26 22:08:51,125 - BERTopic - Representation - Extracting topics from clusters using representation models.\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[57], line 6\u001B[0m\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m data_file \u001B[38;5;129;01min\u001B[39;00m input_data:\n\u001B[1;32m 5\u001B[0m timestamp \u001B[38;5;241m=\u001B[39m pd\u001B[38;5;241m.\u001B[39mTimestamp(data_file\u001B[38;5;241m.\u001B[39mname\u001B[38;5;241m.\u001B[39msplit(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m_\u001B[39m\u001B[38;5;124m'\u001B[39m)[\u001B[38;5;241m0\u001B[39m])\n\u001B[0;32m----> 6\u001B[0m \u001B[43mprocess_new_data\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata_file\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtimestamp\u001B[49m\u001B[43m)\u001B[49m\n", + "Cell \u001B[0;32mIn[56], line 23\u001B[0m, in \u001B[0;36mprocess_new_data\u001B[0;34m(data_slice_path, timestamp)\u001B[0m\n\u001B[1;32m 18\u001B[0m embeddings, token_strings, token_embeddings \u001B[38;5;241m=\u001B[39m embedding_service\u001B[38;5;241m.\u001B[39membed(\n\u001B[1;32m 19\u001B[0m texts\u001B[38;5;241m=\u001B[39mtext,\n\u001B[1;32m 20\u001B[0m )\n\u001B[1;32m 22\u001B[0m \u001B[38;5;66;03m# Create topic model for new data\u001B[39;00m\n\u001B[0;32m---> 23\u001B[0m \u001B[43mbertrend\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain_topic_models\u001B[49m\u001B[43m(\u001B[49m\u001B[43m{\u001B[49m\u001B[43mtimestamp\u001B[49m\u001B[43m:\u001B[49m\u001B[43m 
\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m}\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43membeddings\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43membeddings\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43membedding_model\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43membedding_model_name\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 25\u001B[0m \u001B[38;5;66;03m# Save models\u001B[39;00m\n\u001B[1;32m 26\u001B[0m bertrend\u001B[38;5;241m.\u001B[39msave_models()\n", + "File \u001B[0;32m~/dev/BERTrend/bertrend/BERTrend.py:242\u001B[0m, in \u001B[0;36mBERTrend.train_topic_models\u001B[0;34m(self, grouped_data, embedding_model, embeddings)\u001B[0m\n\u001B[1;32m 239\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m 240\u001B[0m logger\u001B[38;5;241m.\u001B[39minfo(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mTraining topic model \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mi\u001B[38;5;241m+\u001B[39m\u001B[38;5;241m1\u001B[39m\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m/\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mlen\u001B[39m(non_empty_groups)\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m...\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 241\u001B[0m topic_models[period], doc_groups[period], emb_groups[period] \u001B[38;5;241m=\u001B[39m (\n\u001B[0;32m--> 242\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_train_by_period\u001B[49m\u001B[43m(\u001B[49m\u001B[43mperiod\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgroup\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43membedding_model\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43membeddings\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 243\u001B[0m ) \u001B[38;5;66;03m# TODO: parallelize?\u001B[39;00m\n\u001B[1;32m 244\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mSuccessfully processed period: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mperiod\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 246\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n", + "File \u001B[0;32m~/dev/BERTrend/bertrend/BERTrend.py:152\u001B[0m, in \u001B[0;36mBERTrend._train_by_period\u001B[0;34m(self, period, group, embedding_model, embeddings)\u001B[0m\n\u001B[1;32m 149\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNumber of documents: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mlen\u001B[39m(docs)\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 151\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCreating topic model...\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 152\u001B[0m topic_model \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtopic_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 153\u001B[0m \u001B[43m \u001B[49m\u001B[43mdocs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdocs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 154\u001B[0m \u001B[43m \u001B[49m\u001B[43membedding_model\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43membedding_model\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 155\u001B[0m \u001B[43m 
\u001B[49m\u001B[43membeddings\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43membeddings_subset\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 156\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241m.\u001B[39mtopic_model\n\u001B[1;32m 158\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mTopic model created successfully\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 160\u001B[0m doc_info_df \u001B[38;5;241m=\u001B[39m topic_model\u001B[38;5;241m.\u001B[39mget_document_info(docs\u001B[38;5;241m=\u001B[39mdocs)\n", + "File \u001B[0;32m~/dev/BERTrend/bertrend/BERTopicModel.py:225\u001B[0m, in \u001B[0;36mBERTopicModel.fit\u001B[0;34m(self, docs, embedding_model, embeddings, zeroshot_topic_list, zeroshot_min_similarity)\u001B[0m\n\u001B[1;32m 222\u001B[0m logger\u001B[38;5;241m.\u001B[39msuccess(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;130;01m\\t\u001B[39;00m\u001B[38;5;124mBERTopic model instance created successfully\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 224\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;130;01m\\t\u001B[39;00m\u001B[38;5;124mFitting BERTopic model\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 225\u001B[0m topics, probs \u001B[38;5;241m=\u001B[39m \u001B[43mtopic_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit_transform\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdocs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43membeddings\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 227\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m topic_model\u001B[38;5;241m.\u001B[39m_outliers:\n\u001B[1;32m 228\u001B[0m logger\u001B[38;5;241m.\u001B[39mwarning(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;130;01m\\t\u001B[39;00m\u001B[38;5;124mNo outliers to reduce.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/_bertopic.py:433\u001B[0m, in \u001B[0;36mBERTopic.fit_transform\u001B[0;34m(self, documents, embeddings, images, y)\u001B[0m\n\u001B[1;32m 430\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_save_representative_docs(custom_documents)\n\u001B[1;32m 431\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 432\u001B[0m \u001B[38;5;66;03m# Extract topics by calculating c-TF-IDF\u001B[39;00m\n\u001B[0;32m--> 433\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_extract_topics\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdocuments\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43membeddings\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43membeddings\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mverbose\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mverbose\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 435\u001B[0m \u001B[38;5;66;03m# Reduce topics\u001B[39;00m\n\u001B[1;32m 436\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mnr_topics:\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/_bertopic.py:3787\u001B[0m, in \u001B[0;36mBERTopic._extract_topics\u001B[0;34m(self, documents, embeddings, mappings, verbose)\u001B[0m\n\u001B[1;32m 3785\u001B[0m documents_per_topic \u001B[38;5;241m=\u001B[39m 
documents\u001B[38;5;241m.\u001B[39mgroupby([\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mTopic\u001B[39m\u001B[38;5;124m'\u001B[39m], as_index\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m)\u001B[38;5;241m.\u001B[39magg({\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mDocument\u001B[39m\u001B[38;5;124m'\u001B[39m: \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin})\n\u001B[1;32m 3786\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mc_tf_idf_, words \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_c_tf_idf(documents_per_topic)\n\u001B[0;32m-> 3787\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtopic_representations_ \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_extract_words_per_topic\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdocuments\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3788\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_create_topic_vectors(documents\u001B[38;5;241m=\u001B[39mdocuments, embeddings\u001B[38;5;241m=\u001B[39membeddings, mappings\u001B[38;5;241m=\u001B[39mmappings)\n\u001B[1;32m 3789\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtopic_labels_ \u001B[38;5;241m=\u001B[39m {key: \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mkey\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m_\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mjoin([word[\u001B[38;5;241m0\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m word \u001B[38;5;129;01min\u001B[39;00m values[:\u001B[38;5;241m4\u001B[39m]])\n\u001B[1;32m 3790\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m key, values \u001B[38;5;129;01min\u001B[39;00m\n\u001B[1;32m 3791\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtopic_representations_\u001B[38;5;241m.\u001B[39mitems()}\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/_bertopic.py:4071\u001B[0m, in \u001B[0;36mBERTopic._extract_words_per_topic\u001B[0;34m(self, words, documents, c_tf_idf, calculate_aspects)\u001B[0m\n\u001B[1;32m 4069\u001B[0m topics \u001B[38;5;241m=\u001B[39m tuner\u001B[38;5;241m.\u001B[39mextract_topics(\u001B[38;5;28mself\u001B[39m, documents, c_tf_idf, topics)\n\u001B[1;32m 4070\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mrepresentation_model, BaseRepresentation):\n\u001B[0;32m-> 4071\u001B[0m topics \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrepresentation_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mextract_topics\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdocuments\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mc_tf_idf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtopics\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 4072\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mrepresentation_model, \u001B[38;5;28mdict\u001B[39m):\n\u001B[1;32m 4073\u001B[0m 
\u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mrepresentation_model\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mMain\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/representation/_mmr.py:68\u001B[0m, in \u001B[0;36mMaximalMarginalRelevance.extract_topics\u001B[0;34m(self, topic_model, documents, c_tf_idf, topics)\u001B[0m\n\u001B[1;32m 66\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m topic, topic_words \u001B[38;5;129;01min\u001B[39;00m topics\u001B[38;5;241m.\u001B[39mitems():\n\u001B[1;32m 67\u001B[0m words \u001B[38;5;241m=\u001B[39m [word[\u001B[38;5;241m0\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m word \u001B[38;5;129;01min\u001B[39;00m topic_words]\n\u001B[0;32m---> 68\u001B[0m word_embeddings \u001B[38;5;241m=\u001B[39m \u001B[43mtopic_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_extract_embeddings\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmethod\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mword\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mverbose\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mFalse\u001B[39;49;00m\u001B[43m)\u001B[49m\n\u001B[1;32m 69\u001B[0m topic_embedding \u001B[38;5;241m=\u001B[39m topic_model\u001B[38;5;241m.\u001B[39m_extract_embeddings(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m \u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(words), method\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mword\u001B[39m\u001B[38;5;124m\"\u001B[39m, verbose\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m)\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m1\u001B[39m, \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)\n\u001B[1;32m 70\u001B[0m topic_words \u001B[38;5;241m=\u001B[39m mmr(topic_embedding, word_embeddings, words, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdiversity, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtop_n_words)\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/_bertopic.py:3408\u001B[0m, in \u001B[0;36mBERTopic._extract_embeddings\u001B[0;34m(self, documents, images, method, verbose)\u001B[0m\n\u001B[1;32m 3406\u001B[0m embeddings \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39membedding_model\u001B[38;5;241m.\u001B[39membed(documents\u001B[38;5;241m=\u001B[39mdocuments, images\u001B[38;5;241m=\u001B[39mimages, verbose\u001B[38;5;241m=\u001B[39mverbose)\n\u001B[1;32m 3407\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m method \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mword\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n\u001B[0;32m-> 3408\u001B[0m embeddings \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43membedding_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43membed_words\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdocuments\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mverbose\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mverbose\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3409\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m method \u001B[38;5;241m==\u001B[39m 
\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mdocument\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n\u001B[1;32m 3410\u001B[0m embeddings \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39membedding_model\u001B[38;5;241m.\u001B[39membed_documents(documents, verbose\u001B[38;5;241m=\u001B[39mverbose)\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/backend/_base.py:53\u001B[0m, in \u001B[0;36mBaseEmbedder.embed_words\u001B[0;34m(self, words, verbose)\u001B[0m\n\u001B[1;32m 38\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21membed_words\u001B[39m(\u001B[38;5;28mself\u001B[39m,\n\u001B[1;32m 39\u001B[0m words: List[\u001B[38;5;28mstr\u001B[39m],\n\u001B[1;32m 40\u001B[0m verbose: \u001B[38;5;28mbool\u001B[39m \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mFalse\u001B[39;00m) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m np\u001B[38;5;241m.\u001B[39mndarray:\n\u001B[1;32m 41\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\" Embed a list of n words into an n-dimensional\u001B[39;00m\n\u001B[1;32m 42\u001B[0m \u001B[38;5;124;03m matrix of embeddings\u001B[39;00m\n\u001B[1;32m 43\u001B[0m \n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 51\u001B[0m \n\u001B[1;32m 52\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m---> 53\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43membed\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mverbose\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/bertopic/backend/_sentencetransformers.py:65\u001B[0m, in \u001B[0;36mSentenceTransformerBackend.embed\u001B[0;34m(self, documents, verbose)\u001B[0m\n\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21membed\u001B[39m(\u001B[38;5;28mself\u001B[39m,\n\u001B[1;32m 52\u001B[0m documents: List[\u001B[38;5;28mstr\u001B[39m],\n\u001B[1;32m 53\u001B[0m verbose: \u001B[38;5;28mbool\u001B[39m \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mFalse\u001B[39;00m) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m np\u001B[38;5;241m.\u001B[39mndarray:\n\u001B[1;32m 54\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\" Embed a list of n documents/words into an n-dimensional\u001B[39;00m\n\u001B[1;32m 55\u001B[0m \u001B[38;5;124;03m matrix of embeddings\u001B[39;00m\n\u001B[1;32m 56\u001B[0m \n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 63\u001B[0m \u001B[38;5;124;03m that each have an embeddings size of `m`\u001B[39;00m\n\u001B[1;32m 64\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m---> 65\u001B[0m embeddings \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43membedding_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mencode\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdocuments\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mshow_progress_bar\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mverbose\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 66\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m embeddings\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py:517\u001B[0m, in \u001B[0;36mSentenceTransformer.encode\u001B[0;34m(self, sentences, prompt_name, prompt, batch_size, show_progress_bar, output_value, precision, convert_to_numpy, 
convert_to_tensor, device, normalize_embeddings)\u001B[0m\n\u001B[1;32m 514\u001B[0m features\u001B[38;5;241m.\u001B[39mupdate(extra_features)\n\u001B[1;32m 516\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m torch\u001B[38;5;241m.\u001B[39mno_grad():\n\u001B[0;32m--> 517\u001B[0m out_features \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mforward\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfeatures\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 518\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdevice\u001B[38;5;241m.\u001B[39mtype \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhpu\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n\u001B[1;32m 519\u001B[0m out_features \u001B[38;5;241m=\u001B[39m copy\u001B[38;5;241m.\u001B[39mdeepcopy(out_features)\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/container.py:250\u001B[0m, in \u001B[0;36mSequential.forward\u001B[0;34m(self, input)\u001B[0m\n\u001B[1;32m 248\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mforward\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m):\n\u001B[1;32m 249\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m module \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m:\n\u001B[0;32m--> 250\u001B[0m \u001B[38;5;28minput\u001B[39m \u001B[38;5;241m=\u001B[39m \u001B[43mmodule\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 251\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28minput\u001B[39m\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/module.py:1736\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 1734\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs) \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[1;32m 1735\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1736\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_call_impl\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/module.py:1747\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 1742\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m 1743\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m 1744\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m 
\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks\n\u001B[1;32m 1745\u001B[0m \u001B[38;5;129;01mor\u001B[39;00m _global_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m 1746\u001B[0m \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1747\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 1749\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 1750\u001B[0m called_always_called_hooks \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mset\u001B[39m()\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/sentence_transformers/models/Transformer.py:118\u001B[0m, in \u001B[0;36mTransformer.forward\u001B[0;34m(self, features)\u001B[0m\n\u001B[1;32m 115\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtoken_type_ids\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01min\u001B[39;00m features:\n\u001B[1;32m 116\u001B[0m trans_features[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtoken_type_ids\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m features[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtoken_type_ids\u001B[39m\u001B[38;5;124m\"\u001B[39m]\n\u001B[0;32m--> 118\u001B[0m output_states \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mauto_model\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mtrans_features\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mreturn_dict\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mFalse\u001B[39;49;00m\u001B[43m)\u001B[49m\n\u001B[1;32m 119\u001B[0m output_tokens \u001B[38;5;241m=\u001B[39m output_states[\u001B[38;5;241m0\u001B[39m]\n\u001B[1;32m 121\u001B[0m features\u001B[38;5;241m.\u001B[39mupdate({\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtoken_embeddings\u001B[39m\u001B[38;5;124m\"\u001B[39m: output_tokens, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mattention_mask\u001B[39m\u001B[38;5;124m\"\u001B[39m: features[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mattention_mask\u001B[39m\u001B[38;5;124m\"\u001B[39m]})\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/module.py:1736\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 1734\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs) \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[1;32m 1735\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1736\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_call_impl\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n", + "File 
\u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/module.py:1747\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 1742\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m 1743\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m 1744\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks\n\u001B[1;32m 1745\u001B[0m \u001B[38;5;129;01mor\u001B[39;00m _global_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m 1746\u001B[0m \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1747\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 1749\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 1750\u001B[0m called_always_called_hooks \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mset\u001B[39m()\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/transformers/models/xlm_roberta/modeling_xlm_roberta.py:834\u001B[0m, in \u001B[0;36mXLMRobertaModel.forward\u001B[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001B[0m\n\u001B[1;32m 825\u001B[0m head_mask \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_head_mask(head_mask, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mconfig\u001B[38;5;241m.\u001B[39mnum_hidden_layers)\n\u001B[1;32m 827\u001B[0m embedding_output \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39membeddings(\n\u001B[1;32m 828\u001B[0m input_ids\u001B[38;5;241m=\u001B[39minput_ids,\n\u001B[1;32m 829\u001B[0m position_ids\u001B[38;5;241m=\u001B[39mposition_ids,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 832\u001B[0m past_key_values_length\u001B[38;5;241m=\u001B[39mpast_key_values_length,\n\u001B[1;32m 833\u001B[0m )\n\u001B[0;32m--> 834\u001B[0m encoder_outputs \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mencoder\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 835\u001B[0m \u001B[43m \u001B[49m\u001B[43membedding_output\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 836\u001B[0m \u001B[43m \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mextended_attention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 837\u001B[0m \u001B[43m \u001B[49m\u001B[43mhead_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhead_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 
838\u001B[0m \u001B[43m \u001B[49m\u001B[43mencoder_hidden_states\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mencoder_hidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 839\u001B[0m \u001B[43m \u001B[49m\u001B[43mencoder_attention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mencoder_extended_attention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 840\u001B[0m \u001B[43m \u001B[49m\u001B[43mpast_key_values\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mpast_key_values\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 841\u001B[0m \u001B[43m \u001B[49m\u001B[43muse_cache\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43muse_cache\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 842\u001B[0m \u001B[43m \u001B[49m\u001B[43moutput_attentions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_attentions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 843\u001B[0m \u001B[43m \u001B[49m\u001B[43moutput_hidden_states\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_hidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 844\u001B[0m \u001B[43m \u001B[49m\u001B[43mreturn_dict\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mreturn_dict\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 845\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 846\u001B[0m sequence_output \u001B[38;5;241m=\u001B[39m encoder_outputs[\u001B[38;5;241m0\u001B[39m]\n\u001B[1;32m 847\u001B[0m pooled_output \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mpooler(sequence_output) \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mpooler \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/module.py:1736\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 1734\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs) \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[1;32m 1735\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1736\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_call_impl\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/miniconda3/envs/bertrend/lib/python3.11/site-packages/torch/nn/modules/module.py:1747\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 1742\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m 1743\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m 1744\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m 
\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks ...\n",
     "    [... repeated torch / transformers forward-call stack frames elided; the run was interrupted manually ...]\n",
     "\u001B[0;31mKeyboardInterrupt\u001B[0m: "
    ]
   }
  ],
  "execution_count": 57
 }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
diff --git a/pyproject.toml b/pyproject.toml
index 4147310..9b5aae9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ bertopic = "0.16.2"
 black = "^24.10.0"
 datamapplot = "0.3.0"
 dateparser = "^1.2.0"
-dask = "2024.9.1" # issues with >=2025.x (https://github.com/dask/dask/issues/11678)
+dask = "2024.12.0" # issues with >=2025.x (https://github.com/dask/dask/issues/11678)
 gensim = "4.3.2"
 hdbscan = "^0.8.40"
 joblib = "^1.4.2"
@@ -38,6 +38,7 @@ markdown = "^3.7"
 nltk = "^3.9.1"
numpy = "<2" openai = "^1.58.1" +opentelemetry-exporter-otlp-proto-grpc = "1.25.0" # to avoid error chroma with protobuf pandas = "^2.2.2" plotly = "^5.24.1" plotly-resampler = "^0.10.0"