Skip to content

Commit

Permalink
[Refactor] Reorganised code structure
Browse files Browse the repository at this point in the history
  • Loading branch information
picaultj committed Dec 17, 2024
1 parent 4d4e752 commit bcb1f6e
Show file tree
Hide file tree
Showing 48 changed files with 251 additions and 243 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ The code base is divided into two main demonstrators:

### Dynamic Topic Modeling Analysis

- [bertrend/topic_analysis](bertrend/topic_analysis): Focuses on dynamic topic modeling analysis using:
- [bertrend/topic_analysis](bertrend/demos/topic_analysis): Focuses on dynamic topic modeling analysis using:
- Dedicated metrics, called TEMPTopic:
- Stability Evaluation: Measures the stability of topics over time, evaluating how consistent and coherent topics remain.
- Volatility Assessment: Analyzes the volatility of topics, identifying how much topics change over different time periods.
Expand All @@ -57,7 +57,7 @@ https://github.com/user-attachments/assets/f600f666-a6da-40be-8b07-5041b3bde1dc

### Weak Signal Analysis

- [bertrend/weak_signals](bertrend/weak_signals): Identifies and analyzes emerging trends and signals
- [bertrend/weak_signals](bertrend/demos/weak_signals): Identifies and analyzes emerging trends and signals

#### Weak Signal Analysis Demonstrator Video
https://github.com/user-attachments/assets/d79368d9-d4e0-4324-8a98-a888f0ab3b65
Expand Down Expand Up @@ -189,7 +189,7 @@ Key Features of the demonstrator:
#### Launch the Weak Signal Analysis Demonstrator
```bash
cd bertrend/demos/weak_signals
CUDA_VISIBLE_DEVICES=<gpu_number> streamlit run app.py
CUDA_VISIBLE_DEVICES=<gpu_number> streamlit run app.py
```

#### Data Preparation
Expand Down
6 changes: 3 additions & 3 deletions bertrend/bertrend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@
from sentence_transformers import SentenceTransformer

from bertrend.topic_model import TopicModel
from bertrend.utils import TEXT_COLUMN
from bertrend.parameters import (
DEFAULT_MIN_SIMILARITY,
DEFAULT_GRANULARITY,
)
from bertrend.weak_signals.topic_modeling import preprocess_model, merge_models
from bertrend.weak_signals.weak_signals import (
from bertrend.trend_analysis.topic_modeling import preprocess_model, merge_models
from bertrend.trend_analysis.weak_signals import (
_initialize_new_topic,
update_existing_topic,
_apply_decay_to_inactive_topics,
)
from bertrend.utils.data_loading import TEXT_COLUMN


class BERTrend:
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import streamlit as st
from loguru import logger

from bertrend.summary import AbstractiveSummarizer
from bertrend.summary.abstractive_summarizer import AbstractiveSummarizer
from bertrend.summary.chatgpt_summarizer import GPTSummarizer
from bertrend.summary.extractive_summarizer import (
EnhancedExtractiveSummarizer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from bertrend import DATA_PATH

from bertrend.topic_analysis.app_utils import (
from bertrend.demos.topic_analysis.app_utils import (
embedding_model_options,
bertopic_options,
umap_options,
Expand All @@ -26,22 +26,22 @@
representation_model_options,
load_data_wrapper,
)
from bertrend.topic_analysis.data_utils import data_overview, choose_data
from bertrend.topic_analysis.state_utils import (
from bertrend.demos.topic_analysis.data_utils import data_overview, choose_data
from bertrend.demos.topic_analysis.state_utils import (
register_widget,
save_widget_state,
restore_widget_state,
)

from bertrend.metrics.topic_metrics import get_coherence_value, get_diversity_value
from bertrend.train import train_BERTopic
from bertrend.utils import (
clean_dataset,
preprocess_french_text,
from bertrend.utils.data_loading import (
split_df_by_paragraphs,
TEXT_COLUMN,
TIMESTAMP_COLUMN,
URL_COLUMN,
TEXT_COLUMN,
preprocess_french_text,
clean_dataset,
)


Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@
import plotly.express as px
import streamlit as st

from bertrend.topic_analysis.state_utils import register_widget
from bertrend.utils import (
PLOTLY_BUTTON_SAVE_CONFIG,
TEXT_COLUMN,
from bertrend.demos.topic_analysis.state_utils import register_widget
from bertrend.trend_analysis.visualizations import PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.utils.data_loading import (
load_data,
TIMESTAMP_COLUMN,
GROUPED_TIMESTAMP_COLUMN,
URL_COLUMN,
TEXT_COLUMN,
TITLE_COLUMN,
URL_COLUMN,
CITATION_COUNT_COL,
load_data,
)

# Default configuration parameters for the application
Expand Down Expand Up @@ -304,7 +304,7 @@ def plot_topics_hierarchy(form_parameters, _topic_model, width=700):

def make_dynamic_topics_split(df, nr_bins):
"""
Split docs into nr_bins and generate a common timestamp label into a new column
Split docs into nr_bins and generate a common timestamp label into a new column
"""
df = df.sort_values(TIMESTAMP_COLUMN, ascending=False)
split_df = np.array_split(df, nr_bins)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
import streamlit as st
from pathlib import Path

from bertrend.topic_analysis.app_utils import plot_docs_reparition_over_time
from bertrend.topic_analysis.state_utils import save_widget_state
from bertrend.utils import TEXT_COLUMN, TIMESTAMP_COLUMN
from bertrend.demos.topic_analysis.app_utils import plot_docs_reparition_over_time
from bertrend.demos.topic_analysis.state_utils import save_widget_state
from bertrend.utils.data_loading import TEXT_COLUMN, TIMESTAMP_COLUMN


def data_overview(df: pd.DataFrame):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,16 @@

from loguru import logger

from bertrend.topic_analysis.state_utils import restore_widget_state
from bertrend.common.openai_client import OpenAI_Client
from bertrend.demos.topic_analysis.state_utils import restore_widget_state
from bertrend.llm_utils.openai_client import OpenAI_Client
from bertrend.trend_analysis.visualizations import PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.utils.data_loading import TIMESTAMP_COLUMN, TEXT_COLUMN
from bertrend_apps.newsletters.newsletter_features import get_most_representative_docs
from bertrend.topic_analysis.app_utils import (
from bertrend.demos.topic_analysis.app_utils import (
compute_topics_over_time,
plot_topics_over_time,
print_docs_for_specific_topic,
)
from bertrend.utils import (
PLOTLY_BUTTON_SAVE_CONFIG,
TEXT_COLUMN,
TIMESTAMP_COLUMN,
)


def generate_topic_description(topic_model, topic_number, filtered_docs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import streamlit as st
from pathlib import Path

from bertrend.topic_analysis.state_utils import (
from bertrend.demos.topic_analysis.state_utils import (
restore_widget_state,
register_widget,
save_widget_state,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
from loguru import logger
from umap import UMAP

from bertrend.topic_analysis.app_utils import (
from bertrend.demos.topic_analysis.app_utils import (
plot_2d_topics,
)
from bertrend.topic_analysis.state_utils import restore_widget_state
from bertrend.utils import TEXT_COLUMN, PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.demos.topic_analysis.state_utils import restore_widget_state
from bertrend.trend_analysis.visualizations import PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.utils.data_loading import TEXT_COLUMN

# Set locale for French date names
locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@
import umap

from bertrend.metrics.temporal_metrics_embedding import TempTopic
from bertrend.topic_analysis.app_utils import (
from bertrend.demos.topic_analysis.app_utils import (
plot_topics_over_time,
compute_topics_over_time,
)
from bertrend.topic_analysis.state_utils import (
from bertrend.demos.topic_analysis.state_utils import (
register_widget,
save_widget_state,
restore_widget_state,
)
from bertrend.utils import TIMESTAMP_COLUMN, TEXT_COLUMN, PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.trend_analysis.visualizations import PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.utils.data_loading import TIMESTAMP_COLUMN, TEXT_COLUMN

# Set locale for French date names
locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
import streamlit as st
from statistics import StatisticsError

from bertrend.topic_analysis.app_utils import compute_topics_over_time
from bertrend.topic_analysis.state_utils import (
from bertrend.demos.topic_analysis.app_utils import compute_topics_over_time
from bertrend.demos.topic_analysis.state_utils import (
restore_widget_state,
register_widget,
save_widget_state,
)
from bertrend.metrics.metrics import TIME_WEIGHT, TopicMetrics
from bertrend.utils import TIMESTAMP_COLUMN, PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.trend_analysis.visualizations import PLOTLY_BUTTON_SAVE_CONFIG
from bertrend.utils.data_loading import TIMESTAMP_COLUMN

# Restore widget state
restore_widget_state()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@
import pandas as pd
import streamlit as st

from bertrend.topic_analysis.app_utils import (
from bertrend.demos.topic_analysis.app_utils import (
plot_remaining_docs_repartition_over_time,
transform_new_data,
compute_topics_over_time,
plot_topics_over_time,
)
from bertrend.topic_analysis.state_utils import (
from bertrend.demos.topic_analysis.state_utils import (
restore_widget_state,
register_widget,
save_widget_state,
)
from bertrend.metrics.metrics import TIME_WEIGHT, TopicMetrics, TEM_x, TEM_y
from bertrend.utils import TIMESTAMP_COLUMN
from bertrend.utils.data_loading import TIMESTAMP_COLUMN

# Restore widget state
restore_widget_state()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bertopic import BERTopic
from typing import List, Optional, Union

from bertrend.topic_analysis.state_utils import restore_widget_state
from bertrend.demos.topic_analysis.state_utils import restore_widget_state


def list_saved_models(saved_models_dir: Union[str, Path]) -> List[Path]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,20 @@
# from langchain_openai import ChatOpenAI
# from langchain_core.messages import HumanMessage, SystemMessage

from bertrend.topic_analysis.app_utils import (
from bertrend.demos.topic_analysis.app_utils import (
plot_topics_over_time,
compute_topics_over_time,
)

import pandas as pd

from bertrend.topic_analysis.state_utils import (
from bertrend.demos.topic_analysis.state_utils import (
restore_widget_state,
register_widget,
save_widget_state,
)
from bertrend.metrics.temporal_metrics import TempTopic
from bertrend.utils import TEXT_COLUMN, TIMESTAMP_COLUMN
from bertrend.utils.data_loading import TIMESTAMP_COLUMN, TEXT_COLUMN


def display_documents_on_click(clicked_point):
Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions bertrend/demos/weak_signals/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# SPDX-License-Identifier: MPL-2.0
# This file is part of BERTrend.
32 changes: 17 additions & 15 deletions bertrend/weak_signals/app.py → bertrend/demos/weak_signals/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
SIGNAL_EVOLUTION_DATA_DIR,
WEAK_SIGNALS_CACHE_PATH,
)
from bertrend.bertrend import BERTrend, calculate_signal_popularity
from bertrend.embedding_service import EmbeddingService
from bertrend.bertrend import BERTrend
from bertrend.services.embedding_service import EmbeddingService
from bertrend.topic_model import TopicModel
from bertrend.weak_signals.messages import (
from bertrend.demos.weak_signals.messages import (
MODEL_MERGING_COMPLETE_MESSAGE,
NO_CACHE_WARNING,
CACHE_PURGED_MESSAGE,
Expand All @@ -38,27 +38,29 @@
NO_GRANULARITY_WARNING,
NO_DATASET_WARNING,
)
from data_loading import load_and_preprocess_data, group_by_days, find_compatible_files
from bertrend.trend_analysis.weak_signals import (
detect_weak_signals_zeroshot,
save_signal_evolution_data,
analyze_signal,
)
from bertrend.utils.data_loading import (
load_and_preprocess_data,
group_by_days,
find_compatible_files,
TEXT_COLUMN,
)
from bertrend.parameters import *
from session_state_manager import SessionStateManager
from topic_modeling import (
merge_models,
preprocess_model,
)
from visualizations import (
from bertrend.trend_analysis.visualizations import (
plot_num_topics_and_outliers,
plot_topics_per_timestamp,
plot_topic_size_evolution,
create_topic_size_evolution_figure,
plot_newly_emerged_topics,
create_sankey_diagram,
PLOTLY_BUTTON_SAVE_CONFIG,
)
from bertrend.utils import PLOTLY_BUTTON_SAVE_CONFIG, TEXT_COLUMN
from weak_signals import (
detect_weak_signals_zeroshot,
analyze_signal,
save_signal_evolution_data,
)


# UI Settings
PAGE_TITLE = "BERTopic Topic Detection"
Expand Down
File renamed without changes.
4 changes: 4 additions & 0 deletions bertrend/llm_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# SPDX-License-Identifier: MPL-2.0
# This file is part of BERTrend.
File renamed without changes.
2 changes: 1 addition & 1 deletion bertrend/metrics/temporal_metrics_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,7 +983,7 @@ def _aggressive_text_preprocessing(self, text: str) -> str:
- Replacing hyphens and similar characters with spaces
- Removing specific prefixes
- Removing all punctuation
- Replacing special characters with spaces (preserving accented characters, common Latin extensions, and newlines)
- Replacing special characters with spaces (preserving accented characters, common Latin extensions, and newlines)
- Normalizing superscripts and subscripts
- Splitting words containing capitals in the middle (while avoiding splitting fully capitalized words)
- Lowercasing all text
Expand Down
4 changes: 4 additions & 0 deletions bertrend/services/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# SPDX-License-Identifier: MPL-2.0
# This file is part of BERTrend.
File renamed without changes.
2 changes: 1 addition & 1 deletion bertrend/summary/chatgpt_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from loguru import logger

from bertrend.common.openai_client import OpenAI_Client
from bertrend.llm_utils.openai_client import OpenAI_Client
from bertrend.summary.prompts import (
FR_SYSTEM_SUMMARY_SENTENCES,
EN_SYSTEM_SUMMARY_SENTENCES,
Expand Down
2 changes: 1 addition & 1 deletion bertrend/summary/extractive_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sentence_transformers.models import Transformer, Pooling
from torch import Tensor

from bertrend.common.openai_client import OpenAI_Client
from bertrend.llm_utils.openai_client import OpenAI_Client
from bertrend.summary.lexrank import degree_centrality_scores
from bertrend.summary.prompts import (
FR_SYSTEM_SUMMARY_SENTENCES,
Expand Down
4 changes: 4 additions & 0 deletions bertrend/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# SPDX-License-Identifier: MPL-2.0
# This file is part of BERTrend.
Loading

0 comments on commit bcb1f6e

Please sign in to comment.