class paguro.Dataset[VFM: paguro.models.vfm.VFrameModel]

A DataFrame-like structure with validation and other extensions.

Constructors

Dataset(...)

Initializes the Dataset class.

Validation + Model

Validation

with_validation(*validators, ...) Self

Add validation to the dataset.

property validation : Validation | None

Retrieves the validation that has been added to the dataset.

validate(*validators, ...) Self

Validate the dataset.

Model

property vcol : VFM

Retrieve the dataset model if it is not None.

property model : VFM | None

Retrieve the dataset model.

with_model[U](model: type[U], *, ...) Dataset[U]

Add a model and validation to the dataset.

without_model() Dataset[Any]

Remove the model and validation from the dataset.

collect_model_blueprint(...) str | None

Generate a blueprint for a model of the dataset based on VFrameModel.

Information

with_info(k: str, /, **mapping: Any) Self

Add information to the dataset.

with_name(name: str | None) Self

Add a name to the dataset.

EDA

skim(config: list[tuple] | None = None, *, ...) Collection

Generate a summary of the dataset based on specified configurations.

Export

to_polars() polars.DataFrame
to_dataframe() polars.DataFrame

To Polars DataFrame. Collects if the underlying dataset is a LazyFrame.

to_lazyframe() polars.LazyFrame

To Polars LazyFrame.

Polars Methods

Adapted

Some polars methods have been adapted to manage model/validation/info or to accept paguro’s types as arguments. The computation over the data is still handled with polars.

write_parquet(file: str | Path | IO[bytes], *, ...) None

Write parquet.

lazy() LazyDataset[VFM]

Polars’ .lazy.

group_by(*by, ...) _GroupBy[Dataset]

Polars’ group_by.

group_by_dynamic(index_column: IntoExpr, ...) _GroupBy[Dataset]

Polars’ group_by_dynamic.

rolling(index_column: IntoExpr, *, period, ...) _GroupBy[Dataset]

Polars’ rolling.

join(other: Dataset[U] | polars.DataFrame, ...) Self

Polars’ join.

join_asof(other: Dataset[U] | polars.DataFrame, *, ...) Self

Polars’ join_asof.

join_where(other: Dataset[U] | polars.DataFrame, ...) Self

Polars’ join_where.

merge_sorted(other: Dataset[U] | polars.DataFrame, key: str) Self

Polars’ merge_sorted.

vstack(other: Dataset[VFM] | polars.DataFrame, *, ...) Self

Polars’ vstack.

rename(mapping, ...) Self

Polars’ rename.

Delegated

set_sorted(column: str, *, descending: bool = False) Self

See set_sorted

approx_n_unique() Self

See approx_n_unique

bottom_k(k: int, *, by: IntoExpr | Iterable[IntoExpr], ...) Self

See bottom_k

cast(dtypes, ...) Self

See cast

clear(n: int = 0) Self

See clear

clone() Self

See clone

collect_schema() Schema

See collect_schema

property columns : Any

See columns

corr(**kwargs: Any) Self

See corr

count() Self

See count

describe(...) Self

See describe

drop(*columns, ...) Self

See drop

drop_in_place(name: str) polars.Series

See drop_in_place

drop_nans(...) Self

See drop_nans

drop_nulls(...) Self

See drop_nulls

property dtypes : Any

See dtypes

equals(other: polars.DataFrame, *, null_equal: bool = True) bool

See equals

estimated_size(unit: SizeUnit = 'b') int | float

See estimated_size

explode(columns, ...) Self

See explode

extend(other: polars.DataFrame) Self

See extend

fill_nan(value: Expr | int | float | None) Self

See fill_nan

fill_null(value: Any | Expr | None = None, ...) Self

See fill_null

filter(*predicates, ...) Self

See filter

property flags : Any

See flags

fold(operation) polars.Series

See fold

gather_every(n: int, offset: int = 0) Self

See gather_every

get_column(name: str, *, ...) polars.Series | Any

See get_column

get_column_index(name: str) int

See get_column_index

get_columns() list[polars.Series]

See get_columns

glimpse(*, max_items_per_column: int = 10, ...) str | None

See glimpse

hash_rows(seed: int = 0, ...) polars.Series

See hash_rows

head(n: int = 5) Self

See head

property height : Any

See height

hstack(columns: list[polars.Series] | polars.DataFrame, ...) Self

See hstack

insert_column(index: int, column: IntoExprColumn) Self

See insert_column

interpolate() Self

See interpolate

is_duplicated() polars.Series

See is_duplicated

is_empty() bool

See is_empty

is_unique() polars.Series

See is_unique

item(row: int | None = None, column: int | str | None = None) Any

See item

iter_columns() Iterator[polars.Series]

See iter_columns

iter_rows(...) Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]

See iter_rows

iter_slices(n_rows: int = 10000) Iterator[polars.DataFrame]

See iter_slices

limit(n: int = 5) Self

See limit

map_columns(column_names, ...) Self

See map_columns

map_rows(function: Callable[[tuple[Any, ...]], Any], ...) Self

See map_rows

match_to_schema(schema: SchemaDict | Schema, *, ...) Self

See match_to_schema

max() Self

See max

max_horizontal() polars.Series

See max_horizontal

mean() Self

See mean

mean_horizontal(*, ignore_nulls: bool = True) polars.Series

See mean_horizontal

median() Self

See median

melt(...) Self

See melt

min() Self

See min

min_horizontal() polars.Series

See min_horizontal

n_chunks(strategy: 'first' | 'all' = 'first') int | list[int]

See n_chunks

n_unique(...) int

See n_unique

null_count() Self

See null_count

partition_by(...) list[polars.DataFrame] | dict[tuple[Any, ...], polars.DataFrame]

See partition_by

pipe(function, ...) T

See pipe

pivot(on, ...) Self

See pivot

property plot : Any

See plot

product() Self

See product

quantile(quantile: float, ...) Self

See quantile

rechunk() Self

See rechunk

remove(*predicates, ...) Self

See remove

replace_column(index: int, column: polars.Series) Self

See replace_column

reverse() Self

See reverse

row(...) tuple[Any, ...] | dict[str, Any]

See row

rows(*, ...) list[tuple[Any, ...]] | list[dict[str, Any]]

See rows

rows_by_key(key, ...) dict[Any, Any]

See rows_by_key

sample(n: int | polars.Series | None = None, *, ...) Self

See sample

property schema : Any

See schema

select(*exprs: IntoExpr | Iterable[IntoExpr], **named_exprs) Self

See select

select_seq(*exprs: IntoExpr | Iterable[IntoExpr], ...) Self

See select_seq

property shape : Any

See shape

shift(n: int = 1, *, fill_value: IntoExpr | None = None) Self

See shift

shrink_to_fit(*, in_place: bool = False) Self

See shrink_to_fit

slice(offset: int, length: int | None = None) Self

See slice

sort(by: IntoExpr | Iterable[IntoExpr], *more_by, ...) Self

See sort

sql(query: str, *, table_name: str = 'self') Self

See sql

std(ddof: int = 1) Self

See std

property style : Any

See style

sum() Self

See sum

sum_horizontal(*, ignore_nulls: bool = True) polars.Series

See sum_horizontal

tail(n: int = 5) Self

See tail

top_k(k: int, *, by: IntoExpr | Iterable[IntoExpr], ...) Self

See top_k

transpose(*, include_header: bool = False, ...) Self

See transpose

unique(...) Self

See unique

unnest(columns, ...) Self

See unnest

unpivot(...) Self

See unpivot

unstack(*, step: int, ...) Self

See unstack

update(other: polars.DataFrame, ...) Self

See update

upsample(time_column: str, *, every: str | timedelta, ...) Self

See upsample

var(ddof: int = 1) Self

See var

property width : Any

See width

with_columns(*exprs: IntoExpr | Iterable[IntoExpr], ...) Self

See with_columns

with_columns_seq(*exprs: IntoExpr | Iterable[IntoExpr], ...) Self

See with_columns_seq

with_row_count(name: str = 'row_nr', offset: int = 0) Self

See with_row_count

with_row_index(name: str = 'index', offset: int = 0) Self

See with_row_index

write_avro(file: str | Path | IO[bytes], ...) None

See write_avro

write_clipboard(*, separator: str = '\t', **kwargs: Any) None

See write_clipboard

write_csv(...) str | None

See write_csv

write_database(table_name: str, connection, ...) int

See write_database

write_delta(target, ...) deltalake.table.TableMerger | None

See write_delta

write_excel(...) Workbook

See write_excel

write_iceberg(target: str | pyiceberg.table.Table, mode) None

See write_iceberg

write_ipc(file, ...) BytesIO | None

See write_ipc

write_ipc_stream(file, ...) BytesIO | None

See write_ipc_stream

write_json(file: IOBase | str | Path | None = None) str | None

See write_json

write_ndjson(...) str | None

See write_ndjson

to_arrow(*, compat_level: CompatLevel | None = None) pa.Table

See to_arrow

to_dict(...) dict[str, polars.Series] | dict[str, list[Any]]

See to_dict

to_dicts() list[dict[str, Any]]

See to_dicts

to_dummies(...) Self

See to_dummies

to_init_repr(n: int = 1000) str

See to_init_repr

to_jax(...) jax.Array | dict[str, jax.Array]

See to_jax

to_numpy(*, ...) np.ndarray[Any, Any]

See to_numpy

to_pandas(*, ...) pd.DataFrame

See to_pandas

to_series(index: int = 0) polars.Series

See to_series

to_struct(...) polars.Series

See to_struct

to_torch(...) torch.Tensor | dict[str, torch.Tensor] | PolarsDataset

See to_torch