Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First Code (ALPHA) #1

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"python.formatting.provider": "black"
}
7 changes: 7 additions & 0 deletions env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: typed_polars
channels:
  - conda-forge
dependencies:
  # The package uses a `match` statement (typed_polars/utils.py), which
  # requires Python 3.10 or newer.
  - python >=3.10
  - polars >=0.15.0
  - pytest >=7.2.0
Empty file added tests/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions tests/test_simple_df.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import polars as pl
import typed_polars as pt


def test_only_types():
    """A dtype-only schema accepts matching columns and rejects swapped ones."""
    schema = pt.DataFrameSchema(
        {"name": pt.Column(pl.Utf8), "age": pt.Column(pl.Int64)}
    )
    names = ["Hampus", "Dennis", "Noah"]
    ages = [28, 26, 20]

    # Columns carry the dtypes the schema declares.
    matching = pl.DataFrame({"name": names, "age": ages})
    assert schema.validate(matching).passed

    # Same column names, but the string and integer data are swapped.
    swapped = pl.DataFrame({"age": names, "name": ages})
    assert not schema.validate(swapped).passed
4 changes: 4 additions & 0 deletions typed_polars/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from typed_polars.schema import DataFrameSchema
from typed_polars.column import Column, CastColumn

import typed_polars.checks as Checks
1 change: 1 addition & 0 deletions typed_polars/checks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# from typed_polars.checks.numerical import le, lt, eq, gt, ge
33 changes: 33 additions & 0 deletions typed_polars/checks/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass

import polars as pl


# https://pandera.readthedocs.io/en/stable/
# https://aeturrell.github.io/coding-for-economists/data-advanced.html
# https://pandera.readthedocs.io/en/stable/data_format_conversion.html


@dataclass
class ValidationResult:
    """Outcome of a validation step: a message and a pass/fail flag."""

    msg: str
    passed: bool

    @staticmethod
    def combine_all(results: list[ValidationResult]) -> ValidationResult:
        """Merge several results into a single one.

        ``None`` entries are skipped. The messages of the remaining entries
        are joined with newlines in input order, and the merged result passes
        only if every remaining entry passed (an empty input passes).
        """
        present = [entry for entry in results if entry is not None]
        messages = [entry.msg for entry in present]
        flags = [entry.passed for entry in present]
        return ValidationResult(msg="\n".join(messages), passed=all(flags))


class Check(ABC):
    """Abstract base for checks that validate a single column."""

    @abstractmethod
    def validate_column(self, series: pl.Series) -> ValidationResult:
        """Validate the given column; concrete checks must override this."""

    def validate_frame_column(self, df: pl.DataFrame, column: str) -> ValidationResult:
        """Look up ``column`` in ``df`` and pass it to ``validate_column``."""
        selected = df[column]
        return self.validate_column(selected)
24 changes: 24 additions & 0 deletions typed_polars/checks/numerical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from datetime import date, datetime
from typing import Callable

import polars as pl

def ge(than: float | int | date | datetime) -> Callable[[str], pl.Expr]:
    """Build a "greater than or equal to ``than``" expression factory.

    Args:
        than: Inclusive lower bound to compare the column against.

    Returns:
        A callable that takes a column name and returns the polars
        expression ``pl.col(name) >= than``.
    """
    # `ge` means greater-or-equal, so the bound itself must satisfy the
    # expression; the previous strict `>` rejected values equal to `than`.
    return lambda x: pl.col(x) >= than



# @dataclass
# class MustBeGreater(ColumnValidator):
# than: float | int | date | datetime
# inclusive: bool
#
# def validate_frame_column(self, df: pl.DataFrame, column: str):
# col = pl.col(column)
#
# series = df[[column]].filter(col >= self.than if self.inclusive else col > self.than)
#
# return ValidationResult(
# f"Number elements outside bounds {self}: {len(df) - len(series)}",
# len(series) == len(df)
# )
37 changes: 37 additions & 0 deletions typed_polars/column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations

from dataclasses import dataclass

import polars as pl

from typed_polars import utils
from typed_polars.checks.check import Check, ValidationResult


@dataclass
class Column:
    """Specification of a single data frame column.

    ``check`` verifies that the column exists, that its dtype equals
    ``type_``, and then runs every configured ``Check`` against it.
    """

    # Expected polars dtype of the column.
    type_: pl.DataType
    # Declared nullability; not enforced by ``check``.
    nullable: bool = False

    # A single check, a list of checks, or None for no extra checks.
    checks: list[Check] | Check | None = None

    def check(self, df: pl.DataFrame, column: str) -> ValidationResult:
        """Validate ``column`` of ``df`` against this specification.

        Args:
            df: Data frame that should contain the column.
            column: Name of the column to validate.

        Returns:
            A failed result if the column is missing or its dtype differs
            from ``type_``; otherwise the combination of all configured
            checks (which passes when there are none).
        """
        if column not in df.columns:
            return ValidationResult(
                msg=f"Column '{column}' is missing from the data frame",
                passed=False,
            )

        actual = df[column].dtype
        if actual != self.type_:
            # Value checks are skipped on a dtype mismatch: they are written
            # for the expected dtype and could raise on other data.
            return ValidationResult(
                msg=f"Column '{column}' has dtype {actual}, expected {self.type_}",
                passed=False,
            )

        results = [
            check.validate_frame_column(df, column)
            for check in utils.as_list(self.checks)
        ]
        # Previously the method fell through and returned None, so every
        # column was silently treated as valid by the schema.
        return ValidationResult.combine_all(results)


@dataclass
class CastColumn(Column):
    """Column specification extended with a cast-related option.

    ``type_``, ``nullable`` and ``checks`` are inherited from ``Column``
    with the same order and defaults, so they are not declared again here.
    """

    # NOTE(review): presumably allows dropping nulls produced by a cast;
    # nothing visible reads this flag yet, so confirm the intended meaning.
    allow_drop_null_on_cast: bool = False
21 changes: 21 additions & 0 deletions typed_polars/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import polars as pl
from dataclasses import dataclass

from typed_polars.checks.check import ValidationResult
from typed_polars.column import Column


@dataclass
class DataFrameSchema:
    """Mapping from column names to the ``Column`` spec each must satisfy."""

    schema: dict[str, Column]

    def validate(self, df: pl.DataFrame) -> ValidationResult:
        """Check every column in the schema against ``df``.

        Each ``Column.check`` is called in schema order with the frame and
        the column name; the per-column results are merged into one result
        with ``ValidationResult.combine_all``.
        """
        per_column = [spec.check(df, name) for name, spec in self.schema.items()]
        return ValidationResult.combine_all(per_column)
13 changes: 13 additions & 0 deletions typed_polars/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import TypeVar

T = TypeVar("T")


def as_list(data: list[T] | T | None) -> list[T]:
    """Normalise ``data`` to a list.

    A list is returned as-is (the same object, not a copy), ``None`` becomes
    an empty list, and any other value is wrapped in a one-element list.
    """
    if isinstance(data, list):
        return data
    if data is None:
        return []
    return [data]