Skip to content

Commit

Permalink
Feature: Set PREPARED_INGREDIENT flag for amounts that occur in phras…
Browse files Browse the repository at this point in the history
…e such as "to yield <amount>"
  • Loading branch information
strangetom committed Jan 16, 2025
1 parent a611c32 commit b17714e
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 9 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ The model has the following accuracy on a test data set of 20% of the total data

```
Sentence-level results:
Accuracy: 94.71%
Accuracy: 94.62%
Word-level results:
Accuracy 97.78%
Precision (micro) 97.74%
Recall (micro) 97.78%
F1 score (micro) 97.75%
Accuracy 97.79%
Precision (micro) 97.75%
Recall (micro) 97.79%
F1 score (micro) 97.76%
```

## Development
Expand Down
11 changes: 10 additions & 1 deletion ingredient_parser/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,11 @@ class CompositeIngredientAmount:
Confidence of parsed ingredient amount, between 0 and 1.
This is the average confidence of all tokens that contribute to this object.
starting_index : int
Index of token in sentence that starts this amount
Index of token in sentence that starts this amount.
PREPARED_INGREDIENT : bool, optional
When True, indicates the amount applies to the prepared ingredient.
When False, indicates the amount applies to the ingredient before preparation.
Default is False.
"""

amounts: list[IngredientAmount]
Expand All @@ -101,6 +105,7 @@ class CompositeIngredientAmount:
text: str = field(init=False)
confidence: float = field(init=False)
starting_index: int = field(init=False)
PREPARED_INGREDIENT: bool = field(init=False)

def __post_init__(self):
"""On dataclass instantiation, generate the text field."""
Expand Down Expand Up @@ -278,6 +283,10 @@ def __post_init__(self):
):
amount.PREPARED_INGREDIENT = True

if isinstance(amount, CompositeIngredientAmount):
for composite_amount in amount.amounts:
composite_amount.PREPARED_INGREDIENT = True


@dataclass
class ParserDebugInfo:
Expand Down
2 changes: 1 addition & 1 deletion ingredient_parser/en/ModelCard.en.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ The model has the following performance metrics:

| Word level accuracy | Sentence level accuracy |
| ------------------- | ----------------------- |
| 97.78 ± 0.23% | 94.71 ± 0.37% |
| 97.79 ± 0.23% | 94.62 ± 0.48% |

These metrics were determined by executing 20 training/evaluation cycles and calculating the mean and standard deviation for the two metrics across all cycles. The uncertainty values provided represent the 99.7% confidence bounds (i.e. 3x standard deviation). The uncertainty is due to the randomisation of the selection of training and evaluation data whenever the model is trained.

Expand Down
5 changes: 5 additions & 0 deletions ingredient_parser/en/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,11 @@
]
# Tokens that indicate an amount is singular
SINGULAR_TOKENS = ["each"]
# Token sequences that, when they immediately precede a quantity, indicate the
# amount refers to the prepared ingredient (e.g. "to yield 2 cups").
# Each entry is a list of lower case tokens. The post-processor's _is_prepared
# check compares a lower-cased two-token window against each entry with ==, so
# entries must remain lists of exactly two lower case tokens unless that check is
# updated as well.
PREPARED_INGREDIENT_TOKENS = [
    ["to", "yield"],
    ["to", "make"],
]

# List of sets, where each set contains the synonyms that represent the same unit.
UNIT_SYNONYMS = [
Expand Down
6 changes: 6 additions & 0 deletions ingredient_parser/en/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ def ingredient_amount_factory(
starting_index: int,
APPROXIMATE: bool = False,
SINGULAR: bool = False,
PREPARED_INGREDIENT: bool = False,
string_units: bool = False,
imperial_units: bool = False,
quantity_fractions: bool = False,
Expand Down Expand Up @@ -449,6 +450,10 @@ def ingredient_amount_factory(
SINGULAR : bool, optional
When True, indicates if the amount refers to a singular item of the ingredient.
Default is False.
PREPARED_INGREDIENT : bool, optional
When True, indicates the amount applies to the prepared ingredient.
When False, indicates the amount applies to the ingredient before preparation.
Default is False.
string_units : bool, optional
If True, return all IngredientAmount units as strings.
If False, convert IngredientAmount units to pint.Unit objects where possible.
Expand Down Expand Up @@ -529,4 +534,5 @@ def ingredient_amount_factory(
SINGULAR=SINGULAR,
RANGE=RANGE,
MULTIPLIER=MULTIPLIER,
PREPARED_INGREDIENT=PREPARED_INGREDIENT,
)
Binary file modified ingredient_parser/en/model.en.crfsuite
Binary file not shown.
93 changes: 91 additions & 2 deletions ingredient_parser/en/postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from ._constants import (
APPROXIMATE_TOKENS,
PREPARED_INGREDIENT_TOKENS,
SINGULAR_TOKENS,
STOP_WORDS,
STRING_NUMBERS_REGEXES,
Expand Down Expand Up @@ -55,6 +56,10 @@ class _PartialIngredientAmount:
SINGULAR : bool, optional
When True, indicates if the amount refers to a singular item of the ingredient.
Default is False.
PREPARED_INGREDIENT : bool, optional
When True, indicates the amount applies to the prepared ingredient.
When False, indicates the amount applies to the ingredient before preparation.
Default is False.
"""

quantity: str
Expand All @@ -64,6 +69,7 @@ class _PartialIngredientAmount:
related_to_previous: bool = False
APPROXIMATE: bool = False
SINGULAR: bool = False
PREPARED_INGREDIENT = False


class PostProcessor:
Expand Down Expand Up @@ -1181,7 +1187,7 @@ def _fallback_pattern(
if label == "QTY":
# Whenever we come across a new QTY, create new IngredientAmount,
# unless the token is "dozen" and the previous label was QTY, in which
# case we combine modify the quantity of the previous amount.
# case we modify the quantity of the previous amount.
if token == "dozen" and labels[i - 1] == "QTY":
amounts[-1].quantity = amounts[-1].quantity + " dozen"
amounts[-1].confidence.append(score)
Expand Down Expand Up @@ -1224,7 +1230,11 @@ def _fallback_pattern(
amounts[-1].APPROXIMATE = True
amounts[-1].SINGULAR = True

# Set APPROXIMATE and SINGULAR flags to be the same for all related amounts
if self._is_prepared(i, tokens, labels, idx):
amounts[-1].PREPARED_INGREDIENT = True

# Set APPROXIMATE, SINGULAR and PREPARED_INGREDIENT flags to be the same for all
# related amounts.
amounts = self._distribute_related_flags(amounts)

# Loop through amounts list to fix unit and confidence
Expand All @@ -1246,6 +1256,7 @@ def _fallback_pattern(
starting_index=amount.starting_index,
APPROXIMATE=amount.APPROXIMATE,
SINGULAR=amount.SINGULAR,
PREPARED_INGREDIENT=amount.PREPARED_INGREDIENT,
string_units=self.string_units,
imperial_units=self.imperial_units,
quantity_fractions=self.quantity_fractions,
Expand Down Expand Up @@ -1435,6 +1446,80 @@ def _is_singular_and_approximate(

return False

def _is_prepared(
    self, i: int, tokens: list[str], labels: list[str], idx: list[int]
) -> bool:
    """Return True if token at current index refers to the prepared ingredient.

    This is determined by the token label being QTY and the two preceding tokens
    matching one of the token sequences in PREPARED_INGREDIENT_TOKENS
    (e.g. "to yield", "to make").

    If the QTY is immediately preceded by a token in APPROXIMATE_TOKENS, then the
    two tokens prior to that are checked against PREPARED_INGREDIENT_TOKENS
    instead.

    All token comparisons are case insensitive.

    If returning True, the indices of the matched PREPARED_INGREDIENT_TOKENS
    tokens are also appended to the self.consumed list. The approximate token
    itself, if present, is not marked as consumed by this function.

    Parameters
    ----------
    i : int
        Index of current token
    tokens : list[str]
        List of all tokens
    labels : list[str]
        List of all token labels
    idx : list[int]
        List of indices of the tokens/labels/scores in the full tokenized sentence

    Returns
    -------
    bool
        True if current token refers to the prepared ingredient

    Examples
    --------
    >>> p = PostProcessor("", [], [], [])
    >>> p._is_prepared(
    ...     2,
    ...     ["to", "yield", "2", "cups"],
    ...     ["COMMENT", "COMMENT", "QTY", "UNIT"],
    ...     [0, 1, 2, 3]
    ... )
    True

    >>> p = PostProcessor("", [], [], [])
    >>> p._is_prepared(
    ...     3,
    ...     ["to", "make", "about", "250", "g"],
    ...     ["COMMENT", "COMMENT", "COMMENT", "QTY", "UNIT"],
    ...     [0, 1, 2, 3, 4]
    ... )
    True
    """
    # All PREPARED_INGREDIENT_TOKENS have length 2, so cannot be prepared if i < 2.
    if i < 2:
        return False

    if labels[i] != "QTY":
        return False

    # Lower-cased window directly before the QTY, e.g. ["to", "yield"].
    # Computed once here as it does not depend on the pattern being tested.
    preceding = [t.lower() for t in tokens[i - 2 : i]]

    # Lower-cased window before an approximate token, e.g. the ["to", "make"]
    # in "to make about 250 g". Only applicable when there are at least three
    # tokens before the QTY and the token directly before it is approximate.
    # The approximate token is lower-cased so that this branch is case
    # insensitive in the same way as the pattern comparison.
    # NOTE(review): assumes every entry of APPROXIMATE_TOKENS is lower case -
    # confirm against _constants.py.
    preceding_approx = None
    if i > 2 and tokens[i - 1].lower() in APPROXIMATE_TOKENS:
        preceding_approx = [t.lower() for t in tokens[i - 3 : i - 1]]

    for pattern in PREPARED_INGREDIENT_TOKENS:
        if preceding == pattern:
            # Mark i - 1 and i - 2 elements as consumed
            self.consumed.append(idx[i - 1])
            self.consumed.append(idx[i - 2])
            return True

        if preceding_approx is not None and preceding_approx == pattern:
            # Mark i - 2 and i - 3 elements as consumed
            self.consumed.append(idx[i - 2])
            self.consumed.append(idx[i - 3])
            return True

    return False

def _distribute_related_flags(
self, amounts: list[_PartialIngredientAmount]
) -> list[_PartialIngredientAmount]:
Expand Down Expand Up @@ -1468,5 +1553,9 @@ def _distribute_related_flags(
for am in group:
am.SINGULAR = True

if any(am.PREPARED_INGREDIENT for am in group):
for am in group:
am.PREPARED_INGREDIENT = True

# Flatten list for return
return list(chain.from_iterable(grouped))
Binary file modified train/data/training.sqlite3
Binary file not shown.

0 comments on commit b17714e

Please sign in to comment.