Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Export products as dataframe #102

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nisystemlink/clients/product/utilities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ._dataframe_utilities import convert_products_to_dataframe
from ._file_utilities import get_products_linked_to_file

# flake8: noqa
27 changes: 27 additions & 0 deletions nisystemlink/clients/product/utilities/_dataframe_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import List

import pandas as pd
from nisystemlink.clients.product.models import Product
from pandas import DataFrame


def convert_products_to_dataframe(products: List[Product]) -> DataFrame:
    """Flatten a list of products into a single normalized dataframe.

    Args:
        products (List[Product]): The products to convert.

    Returns:
        DataFrame: One row per product, with a column for each product field.
            Nested `properties` entries are expanded into their own columns,
            named `properties.property_name`, covering every property key
            found across the input. Columns that hold no value for any
            product are omitted.
    """
    # NOTE(review): a reviewer pointed out that `dict(exclude_none=True)` would
    # keep None-valued fields out of the records up front, which might make the
    # `dropna` pass cheaper or even unnecessary (likely more noticeable on models
    # with many fields). Benchmark and confirm the output is unchanged before
    # switching.
    records = [item.dict() for item in products]
    flattened = pd.json_normalize(records, sep=".")

    # Discard the columns that are empty for every product.
    return flattened.dropna(axis="columns", how="all")
328 changes: 313 additions & 15 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ requests = "^2.28.1"
uplink = "^0.9.7"
pydantic = "^1.10.2"
pyyaml = "^6.0.1"
pandas = "^2.1.0"

[tool.poetry.group.dev.dependencies]
black = ">=22.10,<25.0"
Expand Down
1 change: 1 addition & 0 deletions tests/unit/__init__.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's not add this unit folder; instead, stick with the existing file layout. That would mean putting your tests in a product folder alongside core.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# flake8: noqa
1 change: 1 addition & 0 deletions tests/unit/product/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# flake8: noqa
123 changes: 123 additions & 0 deletions tests/unit/product/test_product_dataframe_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from datetime import datetime
from typing import List

import pandas as pd
import pytest
from nisystemlink.clients.product.models import Product
from nisystemlink.clients.product.utilities import convert_products_to_dataframe
from pandas import DataFrame


@pytest.fixture
def mock_products_data() -> List[Product]:
    """Provide two sample products that share a family and a workspace."""
    # Values common to both sample products.
    shared_family = "product_family"
    shared_workspace = "5ffb2bf6771fa11e877838dd0"

    return [
        Product(
            id="5ffb2bf6771fa11e877838dd1",
            part_number="p1",
            name="product_1",
            family=shared_family,
            updated_at=datetime(2024, 2, 2, 14, 22, 4, 625155),
            file_ids=["file11", "file12"],
            keywords=["keyword11", "keyword12"],
            properties={
                "property11": "property11_value",
                "property12": "property12_value",
            },
            workspace=shared_workspace,
        ),
        Product(
            id="5ffb2bf6771fa11e877838dd2",
            part_number="p2",
            name="product_2",
            family=shared_family,
            updated_at=datetime(2024, 2, 2, 14, 22, 4, 625455),
            file_ids=["file21", "file22"],
            keywords=["keyword21", "keyword22"],
            properties={"property21": "property21_value"},
            workspace=shared_workspace,
        ),
    ]


@pytest.fixture
def expected_products_dataframe(mock_products_data: List[Product]) -> DataFrame:
    """Build the dataframe the conversion should produce for the mock products."""

    def _as_flat_record(product: Product) -> dict:
        # Regular fields first, then one `properties.<key>` entry per property,
        # so the resulting column order is fields followed by properties.
        record = {
            "id": product.id,
            "part_number": product.part_number,
            "name": product.name,
            "family": product.family,
            "updated_at": product.updated_at,
            "file_ids": product.file_ids,
            "keywords": product.keywords,
            "workspace": product.workspace,
        }
        for key, value in (product.properties or {}).items():
            record[f"properties.{key}"] = value
        return record

    flat_records = [_as_flat_record(product) for product in mock_products_data]
    return pd.json_normalize(flat_records)


@pytest.fixture
def empty_products_data() -> List:
    """Provide an empty product list for the no-input conversion case."""
    no_products: List = []
    return no_products


@pytest.mark.enterprise
@pytest.mark.unit
class TestProductDataframeUtilities:
    """Tests for `convert_products_to_dataframe`."""

    def test__convert_products_to_dataframe__with_complete_data(
        self, mock_products_data: List[Product], expected_products_dataframe: DataFrame
    ):
        """Fully populated products convert to the expected dataframe."""
        actual = convert_products_to_dataframe(mock_products_data)

        assert not actual.empty

        actual_columns = actual.columns.to_list()
        expected_columns = expected_products_dataframe.columns.to_list()
        assert actual_columns == expected_columns

        # NOTE(review): a reviewer asked that column data types be checked in
        # addition to column names -- in particular that dates stay dates and
        # arrays stay arrays. The frame comparison below runs with
        # check_dtype=True; confirm that is sufficient for that request.
        pd.testing.assert_frame_equal(
            actual, expected_products_dataframe, check_dtype=True
        )

    def test__convert_products_to_dataframe__with_empty_data(
        self, empty_products_data: List
    ):
        """An empty product list converts to an empty dataframe."""
        actual = convert_products_to_dataframe(empty_products_data)

        assert actual.empty

    def test__convert_products_to_dataframe__with_missing_fields(
        self, mock_products_data: List[Product], expected_products_dataframe: DataFrame
    ):
        """Fields that are unset on every product produce no columns."""
        # Clear the optional fields on every product before converting.
        for item in mock_products_data:
            item.keywords = None
            item.properties = None

        actual = convert_products_to_dataframe(mock_products_data)

        # The expectation is the full frame minus the cleared fields' columns.
        property_columns = expected_products_dataframe.filter(
            like="properties"
        ).columns.to_list()
        columns_to_remove = property_columns + ["keywords"]
        expected = expected_products_dataframe.drop(columns=columns_to_remove)

        assert not actual.empty

        actual_columns = actual.columns.to_list()
        expected_columns = expected.columns.to_list()
        assert actual_columns == expected_columns

        pd.testing.assert_frame_equal(actual, expected, check_dtype=True)