Feature: Set PREPARED_INGREDIENT flag for amounts that occur in phras…

…e such as "to yield <amount>"
strangetom · Jan 16, 2025 · b17714e · b17714e
1 parent a611c32
commit b17714e
Show file tree

Hide file tree

Showing 8 changed files with 118 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -48,13 +48,13 @@ The model has the following accuracy on a test data set of 20% of the total data
 
 ```
 Sentence-level results:
-	Accuracy: 94.71%
+	Accuracy: 94.62%
 
 Word-level results:
-	Accuracy 97.78%
-	Precision (micro) 97.74%
-	Recall (micro) 97.78%
-	F1 score (micro) 97.75%
+	Accuracy 97.79%
+	Precision (micro) 97.75%
+	Recall (micro) 97.79%
+	F1 score (micro) 97.76%
 ```
 
 ## Development

diff --git a/ingredient_parser/dataclasses.py b/ingredient_parser/dataclasses.py
@@ -92,7 +92,11 @@ class CompositeIngredientAmount:
         Confidence of parsed ingredient amount, between 0 and 1.
         This is the average confidence of all tokens that contribute to this object.
     starting_index : int
-        Index of token in sentence that starts this amount
+        Index of token in sentence that starts this amount.
+    PREPARED_INGREDIENT : bool, optional
+        When True, indicates the amount applies to the prepared ingredient.
+        When False, indicates the amount applies to the ingredient before preparation.
+        Default is False.
     """
 
     amounts: list[IngredientAmount]
@@ -101,6 +105,7 @@ class CompositeIngredientAmount:
     text: str = field(init=False)
     confidence: float = field(init=False)
     starting_index: int = field(init=False)
+    PREPARED_INGREDIENT: bool = field(init=False)
 
     def __post_init__(self):
         """On dataclass instantiation, generate the text field."""
@@ -278,6 +283,10 @@ def __post_init__(self):
             ):
                 amount.PREPARED_INGREDIENT = True
 
+                if isinstance(amount, CompositeIngredientAmount):
+                    for composite_amount in amount.amounts:
+                        composite_amount.PREPARED_INGREDIENT = True
+
 
 @dataclass
 class ParserDebugInfo:

diff --git a/ingredient_parser/en/ModelCard.en.md b/ingredient_parser/en/ModelCard.en.md
@@ -124,7 +124,7 @@ The model has the following performance metrics:
 
 | Word level accuracy | Sentence level accuracy |
 | ------------------- | ----------------------- |
-| 97.78 ± 0.23%       | 94.71 ± 0.37%           |
+| 97.79 ± 0.23%       | 94.62 ± 0.48%           |
 
 These metrics were determined by executing 20 training/evaluation cycles and calculating the mean and standard deviation for the two metrics across all cycles. The uncertainty values provided represent the 99.7% confidence bounds (i.e. 3x standard deviation). The uncertainty is due to the randomisation of the selection of training and evaluation data whenever the model is trained.
 

diff --git a/ingredient_parser/en/_constants.py b/ingredient_parser/en/_constants.py
@@ -400,6 +400,11 @@
 ]
 # Tokens that indicate an amount is singular
 SINGULAR_TOKENS = ["each"]
+# Tokens that indicate an amount refers to the prepared ingredient
+PREPARED_INGREDIENT_TOKENS = [
+    ["to", "yield"],
+    ["to", "make"],
+]
 
 # List of sets, where each set contains the synonyms that represent the same unit.
 UNIT_SYNONYMS = [

diff --git a/ingredient_parser/en/_utils.py b/ingredient_parser/en/_utils.py
@@ -418,6 +418,7 @@ def ingredient_amount_factory(
     starting_index: int,
     APPROXIMATE: bool = False,
     SINGULAR: bool = False,
+    PREPARED_INGREDIENT: bool = False,
     string_units: bool = False,
     imperial_units: bool = False,
     quantity_fractions: bool = False,
@@ -449,6 +450,10 @@ def ingredient_amount_factory(
     SINGULAR : bool, optional
         When True, indicates if the amount refers to a singular item of the ingredient.
         Default is False.
+    PREPARED_INGREDIENT : bool, optional
+        When True, indicates the amount applies to the prepared ingredient.
+        When False, indicates the amount applies to the ingredient before preparation.
+        Default is False.
     string_units : bool, optional
         If True, return all IngredientAmount units as strings.
         If False, convert IngredientAmount units to pint.Unit objects where possible.
@@ -529,4 +534,5 @@ def ingredient_amount_factory(
         SINGULAR=SINGULAR,
         RANGE=RANGE,
         MULTIPLIER=MULTIPLIER,
+        PREPARED_INGREDIENT=PREPARED_INGREDIENT,
     )
diff --git a/ingredient_parser/en/model.en.crfsuite b/ingredient_parser/en/model.en.crfsuite
diff --git a/ingredient_parser/en/postprocess.py b/ingredient_parser/en/postprocess.py
@@ -17,6 +17,7 @@
 )
 from ._constants import (
     APPROXIMATE_TOKENS,
+    PREPARED_INGREDIENT_TOKENS,
     SINGULAR_TOKENS,
     STOP_WORDS,
     STRING_NUMBERS_REGEXES,
@@ -55,6 +56,10 @@ class _PartialIngredientAmount:
     SINGULAR : bool, optional
         When True, indicates if the amount refers to a singular item of the ingredient.
         Default is False.
+    PREPARED_INGREDIENT : bool, optional
+        When True, indicates the amount applies to the prepared ingredient.
+        When False, indicates the amount applies to the ingredient before preparation.
+        Default is False.
     """
 
     quantity: str
@@ -64,6 +69,7 @@ class _PartialIngredientAmount:
     related_to_previous: bool = False
     APPROXIMATE: bool = False
     SINGULAR: bool = False
+    PREPARED_INGREDIENT = False
 
 
 class PostProcessor:
@@ -1181,7 +1187,7 @@ def _fallback_pattern(
             if label == "QTY":
                 # Whenever we come across a new QTY, create new IngredientAmount,
                 # unless the token is "dozen" and the previous label was QTY, in which
-                # case we combine modify the quantity of the previous amount.
+                # case we modify the quantity of the previous amount.
                 if token == "dozen" and labels[i - 1] == "QTY":
                     amounts[-1].quantity = amounts[-1].quantity + " dozen"
                     amounts[-1].confidence.append(score)
@@ -1224,7 +1230,11 @@ def _fallback_pattern(
                 amounts[-1].APPROXIMATE = True
                 amounts[-1].SINGULAR = True
 
-        # Set APPROXIMATE and SINGULAR flags to be the same for all related amounts
+            if self._is_prepared(i, tokens, labels, idx):
+                amounts[-1].PREPARED_INGREDIENT = True
+
+        # Set APPROXIMATE, SINGULAR and PREPARED_INGREDIENT flags to be the same for all
+        # related amounts.
         amounts = self._distribute_related_flags(amounts)
 
         # Loop through amounts list to fix unit and confidence
@@ -1246,6 +1256,7 @@ def _fallback_pattern(
                     starting_index=amount.starting_index,
                     APPROXIMATE=amount.APPROXIMATE,
                     SINGULAR=amount.SINGULAR,
+                    PREPARED_INGREDIENT=amount.PREPARED_INGREDIENT,
                     string_units=self.string_units,
                     imperial_units=self.imperial_units,
                     quantity_fractions=self.quantity_fractions,
@@ -1435,6 +1446,80 @@ def _is_singular_and_approximate(
 
         return False
 
+    def _is_prepared(
+        self, i: int, tokens: list[str], labels: list[str], idx: list[int]
+    ) -> bool:
+        """Return True if token at current index refers to the prepared ingredient.
+
+        This is determined by the token label being QTY and the previous tokens being in
+        a list of prepared tokens.
+        If the QTY is preceded by a token in APPROXIMATE_TOKENS, then the tokens prior
+        to that are checked for matches against the prepared tokens list.
+
+        If returning True, also add index of tokens from prepared token list to
+        self.consumed list.
+
+        Parameters
+        ----------
+        i : int
+            Index of current token
+        tokens : list[str]
+            List of all tokens
+        labels : list[str]
+            List of all token labels
+        idx : list[int]
+            List of indices of the tokens/labels/scores in the full tokenized sentence
+
+        Returns
+        -------
+        bool
+            True if current token is a QTY that refers to the prepared ingredient
+
+        Examples
+        --------
+        >>> p = PostProcessor("", [], [], [])
+        >>> p._is_prepared(
+            2,
+            ["to", "yield", "2", "cups"],
+            ["COMMENT", "COMMENT", "QTY", "UNIT"],
+            [0, 1, 2, 3]
+        )
+        True
+
+        >>> p = PostProcessor("", [], [], [])
+        >>> p._is_prepared(
+            3,
+            ["to", "make", "about", "250", "g"],
+            ["COMMENT", "COMMENT", "COMMENT", "QTY", "UNIT"],
+            [0, 1, 2, 3, 4]
+        )
+        True
+        """
+        # All PREPARED_INGREDIENT_TOKENS have length 2, so cannot be prepared if i < 2.
+        if i < 2:
+            return False
+
+        if labels[i] != "QTY":
+            return False
+
+        for pattern in PREPARED_INGREDIENT_TOKENS:
+            if [t.lower() for t in tokens[i - 2 : i]] == pattern:
+                # Mark i - 1 and i - 2 elements as consumed
+                self.consumed.append(idx[i - 1])
+                self.consumed.append(idx[i - 2])
+                return True
+            elif (
+                i > 2
+                and tokens[i - 1].lower() in APPROXIMATE_TOKENS
+                and [t.lower() for t in tokens[i - 3 : i - 1]] == pattern
+            ):
+                # Mark i - 2 and i - 3 elements as consumed
+                self.consumed.append(idx[i - 2])
+                self.consumed.append(idx[i - 3])
+                return True
+
+        return False
+
     def _distribute_related_flags(
         self, amounts: list[_PartialIngredientAmount]
     ) -> list[_PartialIngredientAmount]:
@@ -1468,5 +1553,9 @@ def _distribute_related_flags(
                 for am in group:
                     am.SINGULAR = True
 
+            if any(am.PREPARED_INGREDIENT for am in group):
+                for am in group:
+                    am.PREPARED_INGREDIENT = True
+
         # Flatten list for return
         return list(chain.from_iterable(grouped))
diff --git a/train/data/training.sqlite3 b/train/data/training.sqlite3