Commit

Update documentation to v2.4 (#274)
* First version of the doc

* Apply suggestions from code review

Co-authored-by: Lindsay Brin <lindsay.brin@servicenow.com>

* Add screenshot, rename sections and revert changes

* Add release notes

* Clean changelog

* Apply suggestions from code review

Co-authored-by: Joseph Marinier <joseph.marinier@servicenow.com>
Co-authored-by: Lindsay Brin <lindsay.brin@servicenow.com>

* Remove old image

* Update docs/docs/user-guide/exploration-space/index.md

Co-authored-by: Lindsay Brin <lindsay.brin@servicenow.com>

Co-authored-by: Lindsay Brin <lindsay.brin@servicenow.com>
Co-authored-by: Joseph Marinier <joseph.marinier@servicenow.com>
3 people authored Oct 20, 2022
1 parent 2b6ea60 commit ec8a280
Showing 31 changed files with 81 additions and 64 deletions.
5 changes: 0 additions & 5 deletions CHANGELOG.md
@@ -11,13 +11,8 @@ Released changes are shown in the
## [Not released]

### Added
- New config field to block changes to the config when the app is launched.
- Added new class imbalance warnings.
- Added a new page to compare different pipelines in the performance analysis table.

### Changed
- Added a proposed action, `merge_classes`, and renamed `consider_new_class` to `define_new_class`.
- The order of the classes in the confusion matrix is now smarter: classes where the model gets similarly confused will be closer to one another. The rejection class is always the last row/column in the confusion matrix. A toggle allows the user to keep the original order from the dataset if preferred.

### Deprecated/Breaking Changes

2 changes: 1 addition & 1 deletion CITATION.cff
@@ -18,7 +18,7 @@ authors:
- family-names: "Babu"
given-names: "Nandhini"
title: "Azimuth, an open-source dataset and error analysis tool for text classification"
version: 2.3
version: 2.4
doi: 10.5281/zenodo.6511558
date-released: 2022-08-17
url: "https://github.com/ServiceNow/azimuth"
Binary file modified docs/docs/_static/images/behavioral-testing-summary.png
Binary file modified docs/docs/_static/images/dashboard/behavioral-testing.png
Binary file modified docs/docs/_static/images/dashboard/dashboard.png
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
24 changes: 23 additions & 1 deletion docs/docs/getting-started/changelog.md
@@ -1,5 +1,27 @@
# Releases

## [2.4.0] - 2022-10-20

### Added
- **New dataset warning**: Added new class imbalance warnings.
- **Pipeline Comparison**: Added a comparison mode to the pipeline metrics table, so metrics can be compared across pipelines.
- **New Smart Tag Analysis**: Added a new plot where smart tag patterns over classes can be easily examined in one view.

### Changed
- **Renaming**: Some sections were renamed in the UI, such as:
- Dataset Class Distribution Analysis -> Dataset Warnings
- Performance Analysis -> Pipeline Metrics by Data Subpopulation
- Performance Overview -> Prediction Overview
- **Proposed actions**: We added a new action, `merge_classes`, and renamed `consider_new_class` to `define_new_class`.
- **Improved Confusion Matrix**: The order of the classes in the confusion matrix is now smarter: classes where the model gets similarly confused will be closer to one another. The rejection class is always the last row/column in the confusion matrix. A toggle allows the user to keep the original order from the dataset if preferred. One possible ordering approach is sketched after this list.
- **Refactoring**: We improved the `MetricsPerFilter` module (which generates the pipeline metrics by data subpopulation table). It now computes roughly five times faster.
- **New config fields**: The memory of the Dask cluster can now be set to large (12 GB) for bigger models. The config can also be put in read-only mode, to prevent users from changing its values.
- **Offline Mode**: Azimuth can now be launched without internet.
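
The changelog does not spell out the ordering algorithm. As a rough sketch (not Azimuth's actual code), one could cluster the classes by how often they are confused with each other and pin the rejection class to the last slot; all names here are illustrative:

```python
import numpy as np
from scipy.cluster.hierarchy import leaves_list, linkage
from scipy.spatial.distance import squareform

def smart_class_order(confusion: np.ndarray, rejection_idx: int) -> list[int]:
    """Order classes so that mutually confused classes sit next to each other.

    Illustrative sketch only; the rejection class is pinned to the last slot.
    """
    keep = [i for i in range(confusion.shape[0]) if i != rejection_idx]
    sub = confusion[np.ix_(keep, keep)].astype(float)
    sym = sub + sub.T                      # confusion in either direction counts
    np.fill_diagonal(sym, 0.0)
    dist = 1.0 - sym / (sym.max() or 1.0)  # high mutual confusion => small distance
    np.fill_diagonal(dist, 0.0)
    # The dendrogram leaf order places similarly confused classes side by side.
    order = leaves_list(linkage(squareform(dist, checks=False), method="average"))
    return [keep[i] for i in order] + [rejection_idx]
```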

### Fixed
- Fixed an issue related to HuggingFace where filtering on an empty dataset would result in an error.


## [2.3.0] - 2022-08-17

### Added
@@ -77,7 +99,7 @@
- New Smart Tags `pipeline_disagreement` and `incorrect_for_all_pipelines` as a first step for
pipeline comparison. See section "Pipeline Comparison" [here](../key-concepts/smart-tags.md).
- Links on top words to filter utterances that contain them. See the section "Word
Clouds" [here](../user-guide/exploration-space/performance-overview.md).
Clouds" [here](../user-guide/exploration-space/prediction-overview.md).

## [2.0.0] - 2022-04-12

2 changes: 1 addition & 1 deletion docs/docs/key-concepts/proposed-actions.md
@@ -3,7 +3,7 @@
In the utterance table or the utterance details, annotations can be added to indicate whether an
action should be taken for each data sample. The annotations can be exported with
the dataset from
the [:material-link: Utterances Table](../user-guide/exploration-space/utterances-table.md).
the [:material-link: Utterances Table](../user-guide/exploration-space/utterance-table.md).

<figure markdown>
![Image title](../_static/images/key-concepts/proposed-actions.png)
43 changes: 19 additions & 24 deletions docs/docs/user-guide/dataset-warnings.md
@@ -1,44 +1,39 @@
# Dataset Class Distribution Analysis
# Dataset Warnings

A **discrepancy** between the training and evaluation sets can cause problems with a model. For
example, the model may **not have a representative sample** of examples to train on, making it
**hard to generalize** in production.

Alternatively, you might be measuring the performance of the model on an evaluation set that may
**not be a good indicator of the performance in production**. Distribution analysis aims to give
warnings when the training and evaluation sets look too different in some aspect of the data.
Datasets can suffer from a variety of issues, such as class imbalance, classes with low sample counts, and dataset shift. These warnings help detect some of these issues.

![](../_static/images/dataset-class-distribution-analysis/dataset-warnings-1.png)
![](../_static/images/dataset-class-distribution-analysis/dataset-warnings-2.png)
![](../_static/images/dataset-class-distribution-analysis/dataset-warnings-3.png)
![](../_static/images/dataset-warnings/dataset-warnings-1.png)
![](../_static/images/dataset-warnings/dataset-warnings-2.png)
![](../_static/images/dataset-warnings/dataset-warnings-3.png)
![](../_static/images/dataset-warnings/dataset-warnings-4.png)

## General Warnings

Azimuth performs 2 analyses to assess class size in the training vs evaluation sets.

### Missing samples
## Missing samples

In this first analysis, the application flags when a class has **fewer than `X`** (default is 20)
samples in either the training or the evaluation set. The plot helps to visualize the values for
each class.
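
A minimal sketch of this check, assuming the labels of a split are available as a list (the default of 20 mirrors the text above; this is illustrative, not Azimuth's implementation):

```python
from collections import Counter

def flag_missing_samples(labels: list[str], min_count: int = 20) -> dict[str, int]:
    """Return the classes whose sample count falls below `min_count`."""
    counts = Counter(labels)
    return {cls: n for cls, n in counts.items() if n < min_count}

# The check runs on each split separately (training and evaluation).
print(flag_missing_samples(["refund", "refund", "freeze_account"]))
# {'refund': 2, 'freeze_account': 1}
```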

### Class Imbalance
## Class Imbalance

In this second analysis, Azimuth detects class imbalance issues. It raises a flag for all classes
where the relative difference between the number of samples and the mean in a dataset split is above
where the relative difference between the number of samples in that class and the mean sample count per class in a dataset split is above
a certain threshold `Y`. The default is 50%.
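
In other words, a class is flagged when `abs(count - mean) / mean` exceeds `Y`. A minimal sketch of that rule, with illustrative names:

```python
from collections import Counter

def flag_class_imbalance(labels: list[str], threshold: float = 0.5) -> list[str]:
    """Flag classes whose count deviates from the mean count per class
    by more than `threshold` (relative difference), within one split."""
    counts = Counter(labels)
    mean = sum(counts.values()) / len(counts)
    return [cls for cls, n in counts.items() if abs(n - mean) / mean > threshold]

print(flag_class_imbalance(["a"] * 90 + ["b"] * 10))  # ['a', 'b']
```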

## Dataset Shift

A **discrepancy** between the training and evaluation splits can cause problems with a model. For
example, the model may **not have a representative sample** of examples to train on, making it **generalize poorly**
in production.

Alternatively, if your evaluation set does not come from the same data distribution as the data in production, measuring model performance on this evaluation set may **not be a good indicator of the performance in production**. Distribution analysis aims to give
warnings when the training and evaluation sets look too different in some aspect of the data.

### Representation mismatch

The third analysis flags if a class is **over-represented** in the evaluation set (relative to
This analysis flags when a class is **over-represented** in the evaluation set (relative to
other classes) or the training set. If the delta between the percentage of a class in each set is
above `Z`% (default is 5%), the analysis flags it.
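
A minimal sketch of this check, comparing each class's share of the two splits (illustrative names only; `max_delta` plays the role of `Z`):

```python
from collections import Counter

def flag_representation_mismatch(
    train_labels: list[str], eval_labels: list[str], max_delta: float = 0.05
) -> list[str]:
    """Flag classes whose share of the evaluation set differs from their
    share of the training set by more than `max_delta` (5% by default)."""
    train, eval_ = Counter(train_labels), Counter(eval_labels)
    return [
        cls
        for cls in set(train) | set(eval_)
        if abs(eval_[cls] / len(eval_labels) - train[cls] / len(train_labels)) > max_delta
    ]
```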

## Syntactic Warnings

Syntactic warnings indicate differences in the syntax of the utterances between the two sets.

### Length mismatch

Length mismatch compares the number of **tokens per utterance** in both sets. The application flags
10 changes: 5 additions & 5 deletions docs/docs/user-guide/exploration-space/index.md
@@ -2,9 +2,9 @@

The Exploration Space includes the datasets and predictions of your model in an interactive way.
Explore the utterances and the predictions, spot patterns in errors, and annotate the data to
trigger further work to improve the model's performance.
indicate further work to improve the model's predictions.

![Screenshot](../../_static/images/exploration-space/performance-overview.png)
![Screenshot](../../_static/images/exploration-space/prediction-overview.png)

!!! tip "Access from dashboard"

@@ -23,7 +23,7 @@ panel. Both training and evaluation sets can be explored.
pipelines compare. This space also exists without selecting any pipelines, to perform dataset
analysis.

### [Performance Overview](performance-overview.md)
### [Prediction Overview](prediction-overview.md)

* Assess the quality of the metrics for any given subset of the data.
* Visualize the distribution of the confidence scores, according to prediction outcome.
@@ -33,7 +33,7 @@ panel. Both training and evaluation sets can be explored.

* Visualize the model confusion between each pair of intents.

### [Utterances Table](utterances-table.md)
### [Utterance Table](utterance-table.md)

* Explore the utterances, with their labels, predictions, and smart tags.
* Access all utterance details, including the detailed prediction results, the behavioral tests, and
@@ -113,8 +113,8 @@ filters are listed below.
filters** within a category, respectively `NO_SMART_TAGS` and `NO_ACTION`.

##### Search
![Screenshot](../../_static/images/control-panel/filter-search.png){: style="width:400px"}

* Use the search bar to find specific filters.
![Screenshot](../../_static/images/control-panel/filter-search.png){: style="width:400px"}

--8<-- "includes/abbreviations.md"
@@ -1,13 +1,13 @@
# Performance Overview
# Prediction Overview

The Performance Overview centralizes the **metrics**, the **confidence histogram** and two **word
The Prediction Overview centralizes the **metrics**, the **confidence histogram** and two **word
clouds** to show important words from the utterances.

![Screenshot](../../_static/images/exploration-space/performance-overview.png)
![Screenshot](../../_static/images/exploration-space/prediction-overview.png)

## Metrics

Assess the quality of the model with different performance metrics. Hover over the information icon
Assess the quality of the model with different metrics. Hover over the information icon
to see more information on each metric.

![Screenshot](../../_static/images/exploration-space/metrics.png)
@@ -1,9 +1,9 @@
# Utterances Table
# Utterance Table

The utterance table view contains **all the utterances with their predictions**. The table also
includes information such as smart tags and proposed actions, if applicable.

![](../../_static/images/exploration-space/utterances-table.png)
![](../../_static/images/exploration-space/utterance-table.png)

To see **more details on an utterance**, click any row, which will open
the [:material-link: Utterance Details](utterance-details.md) page.
26 changes: 12 additions & 14 deletions docs/docs/user-guide/index.md
@@ -3,7 +3,7 @@
**Welcome to Azimuth!**

Explore the different analyses and tools of Azimuth using the dashboard. Navigate through the
different sections to get a deeper understanding of the dataset and the model.
different sections to get a deeper understanding of the dataset and the pipeline.

!!! info "Use Azimuth with no pipeline, or with multiple pipelines"

@@ -33,31 +33,29 @@ The top banner contains useful information and links.
[**Exploration Space**](exploration-space/index.md) to explore and interact with the utterances
and the predictions.

## Dataset Class Distribution Analysis
## Dataset Warnings

The Distribution Analysis section highlights **gaps between the class distributions** of the
training and the evaluation sets.
The dataset warnings section highlights issues related to **class size**, **class imbalance** and **dataset shift**, i.e. differences between the data distributions of the training and the evaluation sets.

![Screenshot](../_static/images/dashboard/dataset-class-distribution-analysis.png)
![Screenshot](../_static/images/dashboard/dataset-warnings.png)

* **Missing samples**: Verify that each intent has sufficient samples in both sets.
* **Class imbalance**: Flag when some classes suffer from imbalance in either split.
* **Representation mismatch**: Check that the representation of each intent is similar in both
sets.
* **Length mismatch**: Verify that the utterances' lengths are similar for each intent in both sets.

Select `View Details` to get
to [:material-link: Dataset Class Distribution Analysis](dataset-warnings.md).
Select `View Details` to get to [:material-link: Dataset Warnings](dataset-warnings.md).

## Performance Analysis
## Pipeline Metrics by Data Subpopulation

The Performance Analysis section summarizes the model performance in terms of the [**prediction
outcomes**](../key-concepts/outcomes.md) and **metrics** available in Azimuth. Change the value in
the dropdown :material-arrow-down-drop-circle-outline: to see the metrics broken down per label,
predicted class, or smart tag families. Use the toggle to alternate between the performance on the
training set or on the evaluation set.
This section summarizes the quality of the predictions in terms of the [**prediction
outcomes**](../key-concepts/outcomes.md) and **metrics** available in Azimuth, for different data subpopulations. Change the value in the dropdown :material-arrow-down-drop-circle-outline: to see the metrics broken down per label, predicted class, or smart tag family. Use the toggle to alternate between the training set and the evaluation set.

The user can click on any row in the table to get directly to the exploration space with the corresponding filters applied. This allows for further investigation of errors. As an example, clicking on the row with the label `freeze_account` will bring the user to the exploration space with that same filter applied. This works with prediction classes and smart tags too.

Click on `Compare pipelines` to display the table fullscreen and compare all metrics across pipelines, as explained in [Pipeline Metrics Comparison](pipeline-metrics-comparison.md).

!!! tip "Sort the table and hide columns"

:material-sort: Click a column header to sort the values in ascending or descending order.
@@ -68,7 +66,7 @@ The user can click on any row in the table to get directly to the exploration space
corresponding column, or multiple ones by selecting 'Show columns'. However, the table columns
will reappear if the page is refreshed.

![Screenshot](../_static/images/dashboard/performance-analysis.png)
![Screenshot](../_static/images/dashboard/pipeline-metrics.png)

!!! tip "Go to the exploration space to interact with metrics"

5 changes: 5 additions & 0 deletions docs/docs/user-guide/pipeline-metrics-comparison.md
@@ -0,0 +1,5 @@
# Pipeline Metrics Comparison

![Screenshot](../_static/images/pipeline-metrics-comparison.png)

This table summarizes the quality of the predictions for different data subpopulations. When selecting a second model next to `Compare Baseline with`, the table will display the metrics for both the baseline and the second model, along with delta columns that show the difference for each metric and subpopulation. All interactions available from the [Dashboard](index.md) are also available here, including the ability to sort and hide columns, and to click on a row.
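
As a rough illustration of what the delta columns represent (hypothetical column names, not Azimuth's schema), computing them amounts to aligning the two tables by subpopulation and subtracting:

```python
import pandas as pd

# Hypothetical per-subpopulation metrics for a baseline and a compared pipeline.
baseline = pd.DataFrame({"subpopulation": ["freeze_account", "transfer"],
                         "f1": [0.82, 0.91]})
compared = pd.DataFrame({"subpopulation": ["transfer", "freeze_account"],
                         "f1": [0.89, 0.85]})

# Align the rows by subpopulation (not by position), then subtract.
merged = baseline.merge(compared, on="subpopulation", suffixes=("_base", "_cmp"))
merged["f1_delta"] = merged["f1_cmp"] - merged["f1_base"]
print(merged)
```
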
5 changes: 3 additions & 2 deletions docs/mkdocs.yml
@@ -76,13 +76,14 @@ nav:
- Dashboard:
- user-guide/index.md
- user-guide/dataset-warnings.md
- user-guide/pipeline-metrics-comparison.md
- user-guide/behavioral-testing-summary.md
- user-guide/post-processing-analysis.md
- Exploration Space:
- user-guide/exploration-space/index.md
- user-guide/exploration-space/performance-overview.md
- user-guide/exploration-space/prediction-overview.md
- user-guide/exploration-space/confusion-matrix.md
- user-guide/exploration-space/utterances-table.md
- user-guide/exploration-space/utterance-table.md
- user-guide/exploration-space/utterance-details.md
- user-guide/settings.md
- user-guide/custom-utterances.md
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "azimuth"
version = "2.3.0"
version = "2.4.0"
description = "Azimuth provides a unified error analysis experience to data scientists."
readme = "README.md"
authors = ["Azimuth team <azimuth-team@servicenow.com>"]
9 changes: 5 additions & 4 deletions webapp/src/components/Metrics/PerformanceAnalysisTable.tsx
@@ -172,10 +172,11 @@ const PerformanceAnalysisTable: React.FC<Props> = ({
basePipeline,
...(comparedPipeline !== undefined &&
comparedPipelineData && {
comparedPipeline:
comparedPipelineData.metricsPerFilter[
selectedMetricPerFilterOption
][index],
comparedPipeline: comparedPipelineData.metricsPerFilter[
selectedMetricPerFilterOption
].find(
({ filterValue }) => filterValue === basePipeline.filterValue
),
}),
})
),
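
The change above replaces positional indexing with a `.find()` on `filterValue`, so compared-pipeline rows are matched to base-pipeline rows by key rather than by position. A minimal Python illustration of the same pattern (names are illustrative):

```python
# Per-subpopulation rows for two pipelines; note the differing row order.
base = [{"filterValue": "freeze_account", "f1": 0.82},
        {"filterValue": "transfer", "f1": 0.91}]
compared = [{"filterValue": "transfer", "f1": 0.89},
            {"filterValue": "freeze_account", "f1": 0.85}]

for i, row in enumerate(base):
    # Old behavior: positional pairing silently mixes up subpopulations.
    by_position = compared[i]
    # New behavior: pair rows by their filter value, like the `.find()` above.
    by_key = next(r for r in compared if r["filterValue"] == row["filterValue"])
    print(row["filterValue"], by_position["filterValue"], by_key["filterValue"])
```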
