Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] First version of curebot bertopic refactor #24

Merged
merged 29 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
04a2a39
Simplified typing
picaultj Jan 20, 2025
a11cd28
Refactor BERTrend (moved functions inside the class)
picaultj Jan 20, 2025
5676c32
Refactor generate_topic_description (JSON output)
picaultj Jan 20, 2025
8d6b632
Simplification of parameters of analyze_signal
picaultj Jan 20, 2025
7009d59
Updated train topics function for incremental topic learning
picaultj Jan 27, 2025
e463cf5
Improvements notebook, bug fix in merge_all_models
picaultj Jan 27, 2025
56d1a2e
:art: Format Python code with psf/black
invalid-email-address Jan 27, 2025
fb5d043
Bugfix: In some cases, there is no word description for the topic whi…
picaultj Jan 27, 2025
c52f7a5
WIP: Management of data feeds
picaultj Jan 31, 2025
c39e88d
WIP: first version of script for partial training
picaultj Feb 4, 2025
69e89f9
WIP
picaultj Feb 5, 2025
2ffd01e
Simplification / improvement to BERTrend serialization
picaultj Feb 6, 2025
311b844
WIP: dashboard
picaultj Feb 7, 2025
ec016cf
Various bug fixes
picaultj Feb 8, 2025
88b1f67
Updated config for model analysis
picaultj Feb 8, 2025
09c224b
Bug fixes
picaultj Feb 10, 2025
c13ee2b
Bug fix: problem of indices between the html analysis and the title/d…
picaultj Feb 10, 2025
829f71c
Bug fix: language was not correctly taken into account
picaultj Feb 11, 2025
0beff96
Updated BERTrend default path: .../bertrend/data|config|logs etc.
picaultj Feb 11, 2025
1efd89d
[WIP] First version of curebot bertopic refactor
grosjeang Feb 7, 2025
c0da490
Tags usage is now an option
grosjeang Feb 10, 2025
73ed11b
Add newsletter tab to curebot demonstrator
grosjeang Feb 11, 2025
330df9c
Remove useless code
grosjeang Feb 11, 2025
cb98592
Remove useless code
grosjeang Feb 11, 2025
ec819e2
Minor changes
grosjeang Feb 11, 2025
fff73b2
Add support for newsletter edit and download
grosjeang Feb 13, 2025
91c9bc0
Minor improvements
grosjeang Feb 13, 2025
a5feab7
Flag useless file
grosjeang Feb 24, 2025
7433b63
Update tests
grosjeang Feb 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 40 additions & 20 deletions bertrend/BERTopicModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,39 @@ class BERTopicModel:
Utility class to manage and configure BERTopic instances with custom parameters.
"""

def __init__(self, config: str | Path | dict = BERTOPIC_DEFAULT_CONFIG_PATH):
    """
    Initialize the class from a TOML config file or from a dict of overrides.

    `config` can be:
        - a `str` representing the TOML file
        - a `Path` to a TOML file
        - a `dict` (with the same structure as the default config) containing
          values to be overridden compared to the default configuration

    To see file format and list of parameters: bertrend/config/topic_model_default_config.toml

    Raises:
        Exception: if the TOML file cannot be loaded.
        TypeError: if `config` is neither a `str`, a `Path` nor a `dict`.
    """
    if isinstance(config, (str, Path)):
        try:
            self.config = load_toml_config(config)
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback
            raise Exception(f"Failed to load TOML config: {e}") from e
    elif isinstance(config, dict):
        # Load default config
        self.config = load_toml_config(BERTOPIC_DEFAULT_CONFIG_PATH)
        # Override keys with the provided dict
        for section, settings in config.items():
            # The membership test must target the loaded defaults: testing
            # against `config` itself is always true while iterating over it,
            # which would raise KeyError for a section missing from the defaults.
            if section in self.config:
                # Update the settings in that section
                self.config[section].update(settings)
            else:
                # If section doesn't exist, add it
                self.config[section] = settings
    else:
        raise TypeError(
            f"Config must be a string, Path or dict object, got: {type(config)}"
        )

    # Update config (depending on language, etc.)
    self._update_config()

    # Initialize models based on those parameters
    self._initialize_models()
Expand All @@ -92,34 +112,34 @@ def __init__(self, config_file: str | Path = BERTOPIC_DEFAULT_CONFIG_PATH):
)
)

def _load_config(self) -> dict:
@classmethod
def get_default_config(cls) -> dict:
    """
    Return the default configuration as a dict.

    The dict is loaded from `BERTOPIC_DEFAULT_CONFIG_PATH` on every call.
    Useful as a starting point when only some settings have to be modified
    before passing a `dict` config to the constructor.
    """
    return load_toml_config(BERTOPIC_DEFAULT_CONFIG_PATH)

def _update_config(self):
"""
Load the TOML config file as a dict when initializing the class.
Update the config file depending on initially loaded parameters.
"""
config = load_toml_config(self.config_file)

# Handle specific parameters

# Transform ngram_range into tuple
if config["vectorizer_model"].get("ngram_range"):
config["vectorizer_model"]["ngram_range"] = tuple(
config["vectorizer_model"]["ngram_range"]
if self.config["vectorizer_model"].get("ngram_range"):
self.config["vectorizer_model"]["ngram_range"] = tuple(
self.config["vectorizer_model"]["ngram_range"]
)

# Load stop words list
if config["vectorizer_model"].get("stop_words"):
if self.config["vectorizer_model"].get("stop_words"):
stop_words = (
STOPWORDS
if config["global"]["language"] == "French"
if self.config["global"]["language"] == "French"
else ENGLISH_STOPWORDS
)
config["vectorizer_model"]["stop_words"] = stop_words
self.config["vectorizer_model"]["stop_words"] = stop_words

# BERTopic needs a "None" instead of an empty list, otherwise it'll attempt zeroshot topic modeling on an empty list
if not config["bertopic_model"].get("zeroshot_topic_list"): # empty list
config["bertopic_model"]["zeroshot_topic_list"] = None

return config
if not self.config["bertopic_model"].get("zeroshot_topic_list"): # empty list
self.config["bertopic_model"]["zeroshot_topic_list"] = None

def _initialize_models(self):
self.umap_model = UMAP(**self.config["umap_model"])
Expand Down
Loading