from __future__ import annotations
import contextlib
import warnings
from datetime import date, datetime, time, timedelta
from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence, overload
import polars._reexport as pl
from polars.datatypes import (
DTYPE_TEMPORAL_UNITS,
Date,
Datetime,
Duration,
Int32,
Int64,
Struct,
Time,
UInt32,
is_polars_dtype,
py_type_to_dtype,
)
from polars.dependencies import _check_for_numpy
from polars.dependencies import numpy as np
from polars.utils._parse_expr_input import expr_to_lit_or_expr, selection_to_pyexpr_list
from polars.utils._wrap import wrap_df, wrap_expr
from polars.utils.convert import (
_datetime_to_pl_timestamp,
_time_to_pl_time,
_timedelta_to_pl_timedelta,
)
from polars.utils.decorators import deprecated_alias
from polars.utils.various import find_stacklevel
with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import arange as _arange
from polars.polars import arg_sort_by as _arg_sort_by
from polars.polars import arg_where as _arg_where
from polars.polars import as_struct as _as_struct
from polars.polars import coalesce as _coalesce
from polars.polars import col as _col
from polars.polars import collect_all as _collect_all
from polars.polars import cols as _cols
from polars.polars import concat_list as _concat_list
from polars.polars import concat_str as _concat_str
from polars.polars import count as _count
from polars.polars import cov as _cov
from polars.polars import cumfold as _cumfold
from polars.polars import cumreduce as _cumreduce
from polars.polars import datetime as _datetime
from polars.polars import dtype_cols as _dtype_cols
from polars.polars import duration as _duration
from polars.polars import first as _first
from polars.polars import fold as _fold
from polars.polars import last as _last
from polars.polars import lit as _lit
from polars.polars import map_mul as _map_mul
from polars.polars import max_exprs as _max_exprs
from polars.polars import min_exprs as _min_exprs
from polars.polars import pearson_corr as _pearson_corr
from polars.polars import reduce as _reduce
from polars.polars import repeat as _repeat
from polars.polars import spearman_rank_corr as _spearman_rank_corr
from polars.polars import sum_exprs as _sum_exprs
if TYPE_CHECKING:
import sys
from polars.dataframe import DataFrame
from polars.expr.expr import Expr
from polars.lazyframe import LazyFrame
from polars.series import Series
from polars.type_aliases import (
CorrelationMethod,
EpochTimeUnit,
IntoExpr,
PolarsDataType,
PythonLiteral,
RollingInterpolationMethod,
SchemaDict,
TimeUnit,
)
if sys.version_info >= (3, 8):
from typing import Literal
else:
from typing_extensions import Literal
def col(
    name: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType],
    *more_names: str | PolarsDataType,
) -> Expr:
    """
    Return an expression representing column(s) in a dataframe.

    Parameters
    ----------
    name
        The name or datatype of the column(s) to represent. Accepts regular
        expression input. Regular expressions should start with ``^`` and end
        with ``$``.
    *more_names
        Additional names or datatypes of columns to represent, specified as
        positional arguments.

    Examples
    --------
    Pass a single column name to represent that column.

    >>> df = pl.DataFrame(
    ...     {
    ...         "ham": [1, 2, 3],
    ...         "hamburger": [11, 22, 33],
    ...         "foo": [3, 2, 1],
    ...         "bar": ["a", "b", "c"],
    ...     }
    ... )
    >>> df.select(pl.col("foo"))
    shape: (3, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    │ 2   │
    │ 1   │
    └─────┘

    Use the wildcard ``*`` to represent all columns, and ``exclude`` to drop
    some of them again.

    >>> df.select(pl.col("*"))  # doctest: +IGNORE_RESULT
    >>> df.select(pl.col("*").exclude("ham"))  # doctest: +IGNORE_RESULT

    Regular expression input is supported.

    >>> df.select(pl.col("^ham.*$"))
    shape: (3, 2)
    ┌─────┬───────────┐
    │ ham ┆ hamburger │
    │ --- ┆ ---       │
    │ i64 ┆ i64       │
    ╞═════╪═══════════╡
    │ 1   ┆ 11        │
    │ 2   ┆ 22        │
    │ 3   ┆ 33        │
    └─────┴───────────┘

    Multiple columns can be represented by passing a list of names, or by
    passing the names as positional arguments.

    >>> df.select(pl.col(["hamburger", "foo"]))  # doctest: +IGNORE_RESULT
    >>> df.select(pl.col("hamburger", "foo"))  # doctest: +IGNORE_RESULT

    Select all columns that match one or more data types by passing the
    datatype(s).

    >>> df.select(pl.col(pl.Utf8))
    shape: (3, 1)
    ┌─────┐
    │ bar │
    │ --- │
    │ str │
    ╞═════╡
    │ a   │
    │ b   │
    │ c   │
    └─────┘
    >>> df.select(pl.col(pl.Int64, pl.Float64))  # doctest: +IGNORE_RESULT
    """
    if more_names:
        # Positional variadic form: all arguments must be of one kind,
        # either all names or all dtypes (decided by the first argument).
        if isinstance(name, str):
            names_str = [name, *more_names]
            return wrap_expr(_cols(names_str))  # type: ignore[arg-type]
        elif is_polars_dtype(name):
            dtypes = [name, *more_names]
            return wrap_expr(_dtype_cols(dtypes))
        else:
            raise TypeError(
                f"Invalid input for `col`. Expected `str` or `DataType`, got {type(name)!r}"
            )

    if isinstance(name, str):
        return wrap_expr(_col(name))
    elif is_polars_dtype(name):
        return wrap_expr(_dtype_cols([name]))
    elif isinstance(name, Iterable):
        names = list(name)
        if not names:
            # Empty selection is valid and selects nothing.
            return wrap_expr(_cols(names))

        # Inspect the first element to decide between name- and dtype-selection.
        item = names[0]
        if isinstance(item, str):
            return wrap_expr(_cols(names))
        elif is_polars_dtype(item):
            return wrap_expr(_dtype_cols(names))
        else:
            raise TypeError(
                "Invalid input for `col`. Expected iterable of type `str` or `DataType`,"
                f" got iterable of type {type(item)!r}"
            )
    else:
        raise TypeError(
            f"Invalid input for `col`. Expected `str` or `DataType`, got {type(name)!r}"
        )
def element() -> Expr:
    """
    Alias for an element being evaluated in an `eval` expression.

    Examples
    --------
    A horizontal rank computation by taking the elements of a list:

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
    >>> df.with_columns(
    ...     pl.concat_list(["a", "b"]).arr.eval(pl.element().rank()).alias("rank")
    ... )
    shape: (3, 3)
    ┌─────┬─────┬────────────┐
    │ a   ┆ b   ┆ rank       │
    │ --- ┆ --- ┆ ---        │
    │ i64 ┆ i64 ┆ list[f32]  │
    ╞═════╪═════╪════════════╡
    │ 1   ┆ 4   ┆ [1.0, 2.0] │
    │ 8   ┆ 5   ┆ [2.0, 1.0] │
    │ 3   ┆ 2   ┆ [2.0, 1.0] │
    └─────┴─────┴────────────┘

    A mathematical operation on array elements:

    >>> df.with_columns(
    ...     pl.concat_list(["a", "b"]).arr.eval(pl.element() * 2).alias("a_b_doubled")
    ... )  # doctest: +IGNORE_RESULT
    """
    # The element placeholder is simply a column expression with an empty name.
    placeholder = col("")
    return placeholder
@overload
def count(column: str) -> Expr:
    ...


@overload
def count(column: Series) -> int:
    ...


@overload
def count(column: None = None) -> Expr:
    ...


def count(column: str | Series | None = None) -> Expr | int:
    """
    Count the number of values in this column/context.

    Parameters
    ----------
    column
        If dtype is:

        * ``pl.Series`` : count the values in the series.
        * ``str`` : count the values in this column.
        * ``None`` : count the number of values in this context.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.count())
    shape: (1, 1)
    ┌───────┐
    │ count │
    │ ---   │
    │ u32   │
    ╞═══════╡
    │ 3     │
    └───────┘
    >>> df.groupby("c", maintain_order=True).agg(pl.count())  # doctest: +IGNORE_RESULT
    """
    # Eager path: a concrete Series is counted immediately.
    if isinstance(column, pl.Series):
        return column.len()
    # Lazy paths: build an expression for the whole context or one column.
    if column is None:
        return wrap_expr(_count())
    return col(column).count()
def implode(name: str) -> Expr:
    """
    Aggregate all column values into a list.

    Parameters
    ----------
    name
        Name of the column that should be imploded.
    """
    # Sugar for `pl.col(name).implode()`.
    return col(name).implode()
def list_(name: str) -> Expr:
    """
    Aggregate to list.

    .. deprecated:: 0.17.3
        ``list`` will be removed in favor of ``implode``.

    Parameters
    ----------
    name
        Name of the column that should be aggregated into a list.
    """
    # Deprecated alias; warn at the caller's stack level, then delegate.
    warnings.warn(
        "`pl.list` is deprecated, please use `pl.implode` instead.",
        DeprecationWarning,
        stacklevel=find_stacklevel(),
    )
    return col(name).implode()
@overload
def std(column: str, ddof: int = 1) -> Expr:
    ...


@overload
def std(column: Series, ddof: int = 1) -> float | None:
    ...


def std(column: str | Series, ddof: int = 1) -> Expr | float | None:
    """
    Get the standard deviation.

    Parameters
    ----------
    column
        Column to get the standard deviation from.
    ddof
        "Delta Degrees of Freedom": the divisor used in the calculation is
        N - ddof, where N represents the number of elements.
        By default ddof is 1.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.std("a"))
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 3.605551 │
    └──────────┘
    >>> df["a"].std()
    3.605551275463989
    """
    # A column name produces a lazy expression; a Series is evaluated eagerly.
    if not isinstance(column, pl.Series):
        return col(column).std(ddof)
    return column.std(ddof)
@overload
def var(column: str, ddof: int = 1) -> Expr:
    ...


@overload
def var(column: Series, ddof: int = 1) -> float | None:
    ...


def var(column: str | Series, ddof: int = 1) -> Expr | float | None:
    """
    Get the variance.

    Parameters
    ----------
    column
        Column to get the variance of.
    ddof
        "Delta Degrees of Freedom": the divisor used in the calculation is
        N - ddof, where N represents the number of elements.
        By default ddof is 1.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.var("a"))
    shape: (1, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ 13.0 │
    └──────┘
    >>> df["a"].var()
    13.0
    """
    # A column name produces a lazy expression; a Series is evaluated eagerly.
    if not isinstance(column, pl.Series):
        return col(column).var(ddof)
    return column.var(ddof)
@overload
def max(exprs: Series) -> PythonLiteral | None:  # type: ignore[misc]
    ...


@overload
def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | Any:
    """
    Get the maximum value.

    If a single string is passed, this is an alias for ``pl.col(name).max()``.
    If a single Series is passed, this is an alias for ``Series.max()``.
    Otherwise, this function computes the maximum value horizontally across
    multiple columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings
        are parsed as column names, other non-expression inputs are parsed as
        literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional
        arguments.

    Examples
    --------
    Get the maximum value by row by passing multiple columns/expressions.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 8, 3],
    ...         "b": [4, 5, 2],
    ...         "c": ["foo", "bar", "foo"],
    ...     }
    ... )
    >>> df.select(pl.max("a", "b"))
    shape: (3, 1)
    ┌─────┐
    │ max │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 4   │
    │ 8   │
    │ 3   │
    └─────┘

    Get the maximum value of a column by passing a single column name.

    >>> df.select(pl.max("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 8   │
    └─────┘

    Get column-wise maximums for multiple columns by passing a regular
    expression, or call ``.max()`` on a multi-column expression instead.

    >>> df.select(pl.max("^a|b$"))  # doctest: +IGNORE_RESULT
    >>> df.select(pl.col("a", "b").max())  # doctest: +IGNORE_RESULT
    """
    # Single-argument shortcuts: eager Series reduction / per-column alias.
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.max()
        if isinstance(exprs, str):
            return col(exprs).max()

    # Horizontal reduction across all given expressions.
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_max_exprs(pyexprs))
@overload
def min(exprs: Series) -> PythonLiteral | None:  # type: ignore[misc]
    ...


@overload
def min(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


def min(
    exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr
) -> Expr | PythonLiteral | None:
    """
    Get the minimum value.

    If a single string is passed, this is an alias for ``pl.col(name).min()``.
    If a single Series is passed, this is an alias for ``Series.min()``.
    Otherwise, this function computes the minimum value horizontally across
    multiple columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings
        are parsed as column names, other non-expression inputs are parsed as
        literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional
        arguments.

    Examples
    --------
    Get the minimum value by row by passing multiple columns/expressions.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 8, 3],
    ...         "b": [4, 5, 2],
    ...         "c": ["foo", "bar", "foo"],
    ...     }
    ... )
    >>> df.select(pl.min("a", "b"))
    shape: (3, 1)
    ┌─────┐
    │ min │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 5   │
    │ 2   │
    └─────┘

    Get the minimum value of a column by passing a single column name.

    >>> df.select(pl.min("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    └─────┘

    Get column-wise minimums for multiple columns by passing a regular
    expression, or call ``.min()`` on a multi-column expression instead.

    >>> df.select(pl.min("^a|b$"))  # doctest: +IGNORE_RESULT
    >>> df.select(pl.col("a", "b").min())  # doctest: +IGNORE_RESULT
    """
    # Single-argument shortcuts: eager Series reduction / per-column alias.
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.min()
        if isinstance(exprs, str):
            return col(exprs).min()

    # Horizontal reduction across all given expressions.
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_min_exprs(pyexprs))
@overload
def sum(exprs: Series) -> int | float:  # type: ignore[misc]
    ...


@overload
def sum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


@deprecated_alias(column="exprs")
def sum(
    exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr
) -> Expr | int | float:
    """
    Sum all values.

    If a single string is passed, this is an alias for ``pl.col(name).sum()``.
    If a single Series is passed, this is an alias for ``Series.sum()``.
    Otherwise, this function computes the sum horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings
        are parsed as column names, other non-expression inputs are parsed as
        literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional
        arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2],
    ...         "b": [3, 4],
    ...         "c": [5, 6],
    ...     }
    ... )

    Sum a column by name:

    >>> df.select(pl.sum("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    └─────┘

    Sum a list of columns/expressions horizontally:

    >>> df.with_columns(pl.sum("a", "c"))
    shape: (2, 4)
    ┌─────┬─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   ┆ sum │
    │ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   ┆ 6   │
    │ 2   ┆ 4   ┆ 6   ┆ 8   │
    └─────┴─────┴─────┴─────┘

    Sum a series:

    >>> pl.sum(df.get_column("a"))
    3

    To aggregate the sums for more than one column/expression use
    ``pl.col(list).sum()`` or a regular expression selector like
    ``pl.sum(regex)``:

    >>> df.select(pl.col("a", "c").sum())  # doctest: +IGNORE_RESULT
    >>> df.select(pl.sum("^.*[bc]$"))  # doctest: +IGNORE_RESULT
    """
    # Single-argument shortcuts: eager Series reduction / per-column alias.
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.sum()
        if isinstance(exprs, str):
            return col(exprs).sum()

    # Horizontal reduction across all given expressions.
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_sum_exprs(pyexprs))
@overload
def mean(column: str) -> Expr:
    ...


@overload
def mean(column: Series) -> float:
    ...


def mean(column: str | Series) -> Expr | float | None:
    """
    Get the mean value.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.mean("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 4.0 │
    └─────┘
    >>> pl.mean(df["a"])
    4.0
    """
    # Eager reduction for a Series, lazy expression for a column name.
    return column.mean() if isinstance(column, pl.Series) else col(column).mean()
@overload
def avg(column: str) -> Expr:
    ...


@overload
def avg(column: Series) -> float:
    ...


def avg(column: str | Series) -> Expr | float:
    """
    Alias for mean.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.avg("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 4.0 │
    └─────┘
    >>> pl.avg(df["a"])
    4.0
    """
    # Thin alias: delegate straight to `mean`.
    return mean(column)
@overload
def median(column: str) -> Expr:
    ...


@overload
def median(column: Series) -> float | int:
    ...


def median(column: str | Series) -> Expr | float | int | None:
    """
    Get the median value.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.median("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 3.0 │
    └─────┘
    >>> pl.median(df["a"])
    3.0
    """
    # Eager reduction for a Series, lazy expression for a column name.
    return column.median() if isinstance(column, pl.Series) else col(column).median()
@overload
def n_unique(column: str) -> Expr:
    ...


@overload
def n_unique(column: Series) -> int:
    ...


def n_unique(column: str | Series) -> Expr | int:
    """
    Count unique values.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 1], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.n_unique("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    └─────┘
    >>> pl.n_unique(df["a"])
    2
    """
    # Eager count for a Series, lazy expression for a column name.
    return (
        column.n_unique() if isinstance(column, pl.Series) else col(column).n_unique()
    )
def approx_unique(column: str | Expr) -> Expr:
    """
    Approx count unique values.

    This is done using the HyperLogLog++ algorithm for cardinality estimation.

    Parameters
    ----------
    column
        Column name or Series.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 1], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.approx_unique("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    └─────┘
    """
    # Accept a ready-made expression directly; anything else is a column name.
    expr = column if isinstance(column, pl.Expr) else col(column)
    return expr.approx_unique()
@overload
def first(column: str) -> Expr:
    ...


@overload
def first(column: Series) -> Any:
    ...


@overload
def first(column: None = None) -> Expr:
    ...


def first(column: str | Series | None = None) -> Expr | Any:
    """
    Get the first value.

    Depending on the input type this function does different things:

    - None -> expression to take first column of a context.
    - str -> syntactic sugar for `pl.col(..).first()`
    - Series -> Take first value in `Series`

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.first())  # doctest: +IGNORE_RESULT
    >>> df.select(pl.first("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    └─────┘
    >>> pl.first(df["a"])
    1
    """
    if column is None:
        # First column of the whole context.
        return wrap_expr(_first())
    if isinstance(column, pl.Series):
        # Eager: return the first element, or fail loudly on an empty Series.
        if not column.len():
            raise IndexError("The series is empty, so no first value can be returned.")
        return column[0]
    return col(column).first()
@overload
def last(column: str) -> Expr:
    ...


@overload
def last(column: Series) -> Any:
    ...


@overload
def last(column: None = None) -> Expr:
    ...


def last(column: str | Series | None = None) -> Expr | Any:
    """
    Get the last value.

    Depending on the input type this function does different things:

    - None -> expression to take last column of a context.
    - str -> syntactic sugar for `pl.col(..).last()`
    - Series -> Take last value in `Series`

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.last())  # doctest: +IGNORE_RESULT
    >>> df.select(pl.last("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    └─────┘
    >>> pl.last(df["a"])
    3
    """
    if column is None:
        # Last column of the whole context.
        return wrap_expr(_last())
    if isinstance(column, pl.Series):
        if column.len() > 0:
            return column[-1]
        else:
            # Message previously ended with a stray comma; now consistent
            # with the equivalent error raised by `first`.
            raise IndexError("The series is empty, so no last value can be returned.")
    return col(column).last()
@overload
def head(column: str, n: int = ...) -> Expr:
    ...


@overload
def head(column: Series, n: int = ...) -> Series:
    ...


def head(column: str | Series, n: int = 10) -> Expr | Series:
    """
    Get the first `n` rows.

    Parameters
    ----------
    column
        Column name or Series.
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.head("a"))  # doctest: +IGNORE_RESULT
    >>> df.select(pl.head("a", 2))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 8   │
    └─────┘
    >>> pl.head(df["a"], 2)
    shape: (2,)
    Series: 'a' [i64]
    [
        1
        8
    ]
    """
    # Eager slice for a Series, lazy expression for a column name.
    if not isinstance(column, pl.Series):
        return col(column).head(n)
    return column.head(n)
@overload
def tail(column: str, n: int = ...) -> Expr:
    ...


@overload
def tail(column: Series, n: int = ...) -> Series:
    ...


def tail(column: str | Series, n: int = 10) -> Expr | Series:
    """
    Get the last `n` rows.

    Parameters
    ----------
    column
        Column name or Series.
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.tail("a"))  # doctest: +IGNORE_RESULT
    >>> df.select(pl.tail("a", 2))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 8   │
    │ 3   │
    └─────┘
    >>> pl.tail(df["a"], 2)
    shape: (2,)
    Series: 'a' [i64]
    [
        8
        3
    ]
    """
    # Eager slice for a Series, lazy expression for a column name.
    if not isinstance(column, pl.Series):
        return col(column).tail(n)
    return column.tail(n)
def lit(
    value: Any, dtype: PolarsDataType | None = None, *, allow_object: bool = False
) -> Expr:
    """
    Return an expression representing a literal value.

    Parameters
    ----------
    value
        Value that should be used as a `literal`.
    dtype
        Optionally define a dtype.
    allow_object
        If type is unknown use an 'object' type.
        By default, we will raise a `ValueException`
        if the type is unknown.

    Examples
    --------
    Literal scalar values:

    >>> pl.lit(1)  # doctest: +IGNORE_RESULT
    >>> pl.lit(5.5)  # doctest: +IGNORE_RESULT
    >>> pl.lit(None)  # doctest: +IGNORE_RESULT
    >>> pl.lit("foo_bar")  # doctest: +IGNORE_RESULT
    >>> pl.lit(date(2021, 1, 20))  # doctest: +IGNORE_RESULT
    >>> pl.lit(datetime(2023, 3, 31, 10, 30, 45))  # doctest: +IGNORE_RESULT

    Literal list/Series data (1D):

    >>> pl.lit([1, 2, 3])  # doctest: +IGNORE_RESULT
    >>> pl.lit(pl.Series("x", [1, 2, 3]))  # doctest: +IGNORE_RESULT

    Literal list/Series data (2D):

    >>> pl.lit([[1, 2], [3, 4]])  # doctest: +IGNORE_RESULT
    >>> pl.lit(pl.Series("y", [[1, 2], [3, 4]]))  # doctest: +IGNORE_RESULT

    Expected datatypes:

    - ``pl.lit([])`` -> empty Series Float32
    - ``pl.lit([1, 2, 3])`` -> Series Int64
    - ``pl.lit([[]])`` -> empty Series List<Null>
    - ``pl.lit([[1, 2, 3]])`` -> Series List<i64>
    - ``pl.lit(None)`` -> Series Null
    """
    time_unit: TimeUnit

    if isinstance(value, datetime):
        # Honour a time unit given via `dtype`, defaulting to microseconds.
        time_unit = "us" if dtype is None else getattr(dtype, "time_unit", "us")
        time_zone = (
            value.tzinfo
            if getattr(dtype, "time_zone", None) is None
            else getattr(dtype, "time_zone", None)
        )
        if value.tzinfo is not None and getattr(dtype, "time_zone", None) is not None:
            raise TypeError(
                "Cannot cast tz-aware value to tz-aware dtype. "
                "Please drop the time zone from the dtype."
            )
        e = lit(_datetime_to_pl_timestamp(value, time_unit)).cast(Datetime(time_unit))
        if time_zone is not None:
            return e.dt.replace_time_zone(str(time_zone))
        else:
            return e

    elif isinstance(value, timedelta):
        time_unit = "us" if dtype is None else getattr(dtype, "time_unit", "us")
        return lit(_timedelta_to_pl_timedelta(value, time_unit)).cast(
            Duration(time_unit)
        )

    elif isinstance(value, time):
        return lit(_time_to_pl_time(value)).cast(Time)

    elif isinstance(value, date):
        # `date` is checked after `datetime` (datetime is a date subclass).
        return lit(datetime(value.year, value.month, value.day)).cast(Date)

    elif isinstance(value, pl.Series):
        name = value.name
        value = value._s
        e = wrap_expr(_lit(value, allow_object))
        if name == "":
            return e
        return e.alias(name)

    elif (_check_for_numpy(value) and isinstance(value, np.ndarray)) or isinstance(
        value, (list, tuple)
    ):
        return lit(pl.Series("", value))

    elif dtype:
        return wrap_expr(_lit(value, allow_object)).cast(dtype)

    try:
        # numpy literals like np.float32(0) have item/dtype
        item = value.item()

        # numpy item() is py-native datetime/timedelta when units < 'ns'
        if isinstance(item, (datetime, timedelta)):
            return lit(item)

        # handle 'ns' units
        if isinstance(item, int) and hasattr(value, "dtype"):
            dtype_name = value.dtype.name
            if dtype_name.startswith(("datetime64[", "timedelta64[")):
                # Extract the unit between the brackets. (A fixed slice of
                # [11:-1] mishandled "timedelta64[...]", whose prefix is 12
                # characters long, producing e.g. "[ns".)
                time_unit = dtype_name[dtype_name.index("[") + 1 : -1]
                return lit(item).cast(
                    Datetime(time_unit)
                    if dtype_name.startswith("date")
                    else Duration(time_unit)
                )
    except AttributeError:
        # Plain Python scalar without `.item()`.
        item = value

    return wrap_expr(_lit(item, allow_object))
@overload
def cumsum(exprs: Series) -> Series:  # type: ignore[misc]
    ...


@overload
def cumsum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


@deprecated_alias(column="exprs")
def cumsum(
    exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr
) -> Expr | Series:
    """
    Cumulatively sum all values.

    If a single string is passed, this is an alias for ``pl.col(name).cumsum()``.
    If a single Series is passed, this is an alias for ``Series.cumsum()``.
    Otherwise, this function computes the cumulative sum horizontally across
    multiple columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings
        are parsed as column names, other non-expression inputs are parsed as
        literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional
        arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2],
    ...         "b": [3, 4],
    ...         "c": [5, 6],
    ...     }
    ... )

    Cumulatively sum a column by name:

    >>> df.select(pl.cumsum("a"))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 3   │
    └─────┘

    Cumulatively sum a list of columns/expressions horizontally:

    >>> df.with_columns(pl.cumsum("a", "c"))
    shape: (2, 4)
    ┌─────┬─────┬─────┬───────────┐
    │ a   ┆ b   ┆ c   ┆ cumsum    │
    │ --- ┆ --- ┆ --- ┆ ---       │
    │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
    ╞═════╪═════╪═════╪═══════════╡
    │ 1   ┆ 3   ┆ 5   ┆ {1,6}     │
    │ 2   ┆ 4   ┆ 6   ┆ {2,8}     │
    └─────┴─────┴─────┴───────────┘
    """
    # Single-argument shortcuts: eager Series cumsum / per-column alias.
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.cumsum()
        if isinstance(exprs, str):
            return col(exprs).cumsum()

    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))

    # (Expr): use u32 as that will not cast to float as eagerly
    wrapped = [wrap_expr(e) for e in pyexprs]
    acc = lit(0).cast(UInt32)
    return cumfold(acc, lambda a, b: a + b, wrapped).alias("cumsum")
def spearman_rank_corr(
    a: str | Expr, b: str | Expr, ddof: int = 1, *, propagate_nans: bool = False
) -> Expr:
    """
    Compute the spearman rank correlation between two columns.

    Missing data will be excluded from the computation.

    .. deprecated:: 0.16.10
        ``spearman_rank_corr`` will be removed in favor of
        ``corr(..., method="spearman")``.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    ddof
        "Delta Degrees of Freedom": the divisor used in the calculation is
        N - ddof, where N represents the number of elements.
        By default ddof is 1.
    propagate_nans
        If `True` any `NaN` encountered will lead to `NaN` in the output.
        Defaults to `False` where `NaN` are regarded as larger than any finite
        number and thus lead to the highest rank.

    See Also
    --------
    corr

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.spearman_rank_corr("a", "b"))  # doctest: +SKIP
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 0.5 │
    └─────┘
    """
    # Deprecated entry point; warn, then build the expression directly.
    warnings.warn(
        "`spearman_rank_corr()` is deprecated in favor of `corr()`",
        DeprecationWarning,
        stacklevel=find_stacklevel(),
    )
    # Promote bare column names to expressions.
    expr_a = col(a) if isinstance(a, str) else a
    expr_b = col(b) if isinstance(b, str) else b
    return wrap_expr(
        _spearman_rank_corr(expr_a._pyexpr, expr_b._pyexpr, ddof, propagate_nans)
    )
def pearson_corr(a: str | Expr, b: str | Expr, ddof: int = 1) -> Expr:
    """
    Compute the pearson's correlation between two columns.

    .. deprecated:: 0.16.10
        ``pearson_corr`` will be removed in favor of ``corr(..., method="pearson")``.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    ddof
        "Delta Degrees of Freedom": the divisor used in the calculation is
        N - ddof, where N represents the number of elements.
        By default ddof is 1.

    See Also
    --------
    corr

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.pearson_corr("a", "b"))  # doctest: +SKIP
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.544705 │
    └──────────┘
    """
    # Deprecated entry point; warn, then build the expression directly.
    warnings.warn(
        "`pearson_corr()` is deprecated in favor of `corr()`",
        DeprecationWarning,
        stacklevel=find_stacklevel(),
    )
    # Promote bare column names to expressions.
    expr_a = col(a) if isinstance(a, str) else a
    expr_b = col(b) if isinstance(b, str) else b
    return wrap_expr(_pearson_corr(expr_a._pyexpr, expr_b._pyexpr, ddof))
def corr(
    a: str | Expr,
    b: str | Expr,
    *,
    method: CorrelationMethod = "pearson",
    ddof: int = 1,
    propagate_nans: bool = False,
) -> Expr:
    """
    Compute the pearson's or spearman rank correlation correlation between
    two columns.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    ddof
        "Delta Degrees of Freedom": the divisor used in the calculation is
        N - ddof, where N represents the number of elements.
        By default ddof is 1.
    method : {'pearson', 'spearman'}
        Correlation method.
    propagate_nans
        If `True` any `NaN` encountered will lead to `NaN` in the output.
        Defaults to `False` where `NaN` are regarded as larger than any finite
        number and thus lead to the highest rank.

    Examples
    --------
    Pearson's correlation:

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.corr("a", "b"))
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.544705 │
    └──────────┘

    Spearman rank correlation:

    >>> df.select(pl.corr("a", "b", method="spearman"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 0.5 │
    └─────┘
    """
    # Promote bare column names to expressions.
    expr_a = col(a) if isinstance(a, str) else a
    expr_b = col(b) if isinstance(b, str) else b

    # Dispatch on the requested correlation method.
    if method == "pearson":
        return wrap_expr(_pearson_corr(expr_a._pyexpr, expr_b._pyexpr, ddof))
    if method == "spearman":
        return wrap_expr(
            _spearman_rank_corr(expr_a._pyexpr, expr_b._pyexpr, ddof, propagate_nans)
        )
    raise ValueError(
        f"method must be one of {{'pearson', 'spearman'}}, got {method!r}"
    )
def cov(a: str | Expr, b: str | Expr) -> Expr:
    """
    Compute the covariance between two columns/ expressions.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.cov("a", "b"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 3.0 │
    └─────┘
    """
    # Promote bare column names to expressions.
    expr_a = col(a) if isinstance(a, str) else a
    expr_b = col(b) if isinstance(b, str) else b
    return wrap_expr(_cov(expr_a._pyexpr, expr_b._pyexpr))
def map(
    exprs: Sequence[str] | Sequence[Expr],
    function: Callable[[Sequence[Series]], Series],
    return_dtype: PolarsDataType | None = None,
) -> Expr:
    """
    Map a custom function over multiple columns/expressions.

    Produces a single Series result. The function is evaluated once over the
    full columns (select context), not per group; use ``apply`` for a GroupBy
    context.

    Parameters
    ----------
    exprs
        Input Series to the function.
    function
        Function to apply over the input.
    return_dtype
        dtype of the output Series.

    Returns
    -------
    Expr
    """
    pyexprs = selection_to_pyexpr_list(exprs)
    # apply_groups=False selects the "map" (whole-column) evaluation mode.
    return wrap_expr(
        _map_mul(
            pyexprs, function, return_dtype, apply_groups=False, returns_scalar=False
        )
    )
def apply(
    exprs: Sequence[str | Expr],
    function: Callable[[Sequence[Series]], Series | Any],
    return_dtype: PolarsDataType | None = None,
    *,
    returns_scalar: bool = True,
) -> Expr:
    """
    Apply a custom/user-defined function (UDF) in a GroupBy context.

    Depending on the context it has the following behavior:

    * Select: don't use ``apply``, use ``map``.
    * GroupBy: expected type of ``function``: ``Callable[[Series], Series]``;
      applies a python function over each group.

    Parameters
    ----------
    exprs
        Input Series to the function.
    function
        Function to apply over the input.
    return_dtype
        dtype of the output Series.
    returns_scalar
        If the function returns a single scalar as output.

    Returns
    -------
    Expr
    """
    pyexprs = selection_to_pyexpr_list(exprs)
    # apply_groups=True selects the per-group (GroupBy) evaluation mode.
    return wrap_expr(
        _map_mul(
            pyexprs,
            function,
            return_dtype,
            apply_groups=True,
            returns_scalar=returns_scalar,
        )
    )
def fold(
    acc: IntoExpr,
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
) -> Expr:
    """
    Accumulate over multiple columns horizontally/ row wise with a left fold.

    Parameters
    ----------
    acc
        Accumulator Expression. This is the value that will be initialized when
        the fold starts. For a sum this could for instance be ``lit(0)``.
    function
        Function to apply over the accumulator and the value:
        ``Fn(acc, value) -> new_value``.
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Notes
    -----
    If you simply want the first encountered expression as accumulator,
    consider using ``reduce``.
    """
    acc_expr = expr_to_lit_or_expr(acc, str_to_lit=True)
    # A lone expression (e.g. pl.col("*")) must be wrapped in a list first.
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]
    pyexprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_fold(acc_expr._pyexpr, function, pyexprs))
def reduce(
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
) -> Expr:
    """
    Accumulate over multiple columns horizontally/ row wise with a left fold.

    The first input expression is used as the initial accumulator.

    Parameters
    ----------
    function
        Function to apply over the accumulator and the value:
        ``Fn(acc, value) -> new_value``.
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Notes
    -----
    See ``fold`` for the version with an explicit accumulator.
    """
    # A lone expression (e.g. pl.col("*")) must be wrapped in a list first.
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]
    pyexprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_reduce(function, pyexprs))
def cumfold(
    acc: IntoExpr,
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
    *,
    include_init: bool = False,
) -> Expr:
    """
    Cumulatively accumulate over multiple columns horizontally with a left fold.

    Every cumulative result is added as a separate field in a Struct column.

    Parameters
    ----------
    acc
        Accumulator Expression. This is the value that will be initialized when
        the fold starts. For a sum this could for instance be ``lit(0)``.
    function
        Function to apply over the accumulator and the value:
        ``Fn(acc, value) -> new_value``.
    exprs
        Expressions to aggregate over. May also be a wildcard expression.
    include_init
        Include the initial accumulator state as struct field.

    Notes
    -----
    If you simply want the first encountered expression as accumulator,
    consider using ``cumreduce``.
    """
    acc_expr = expr_to_lit_or_expr(acc, str_to_lit=True)
    # A lone expression (e.g. pl.col("*")) must be wrapped in a list first.
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]
    pyexprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_cumfold(acc_expr._pyexpr, function, pyexprs, include_init))
def cumreduce(
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
) -> Expr:
    """
    Cumulatively accumulate over multiple columns horizontally with a left fold.

    Every cumulative result is added as a separate field in a Struct column.
    The first input expression is used as the initial accumulator.

    Parameters
    ----------
    function
        Function to apply over the accumulator and the value:
        ``Fn(acc, value) -> new_value``.
    exprs
        Expressions to aggregate over. May also be a wildcard expression.
    """
    # A lone expression (e.g. pl.col("*")) must be wrapped in a list first.
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]
    pyexprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_cumreduce(function, pyexprs))
@overload
def any(exprs: Series) -> bool:  # type: ignore[misc]
    ...


@overload
def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


@deprecated_alias(columns="exprs")
def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | bool:
    """
    Evaluate a bitwise OR operation.

    If a single string is passed, this is an alias for ``pl.col(name).any()``.
    If a single Series is passed, this is an alias for ``Series.any()``.

    Otherwise, this function computes the bitwise OR horizontally across
    multiple columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings
        are parsed as column names, other non-expression inputs are parsed as
        literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional
        arguments.
    """
    # Single-input shortcuts: a Series is evaluated eagerly; a bare string is
    # treated as a column name.
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.any()
        if isinstance(exprs, str):
            return col(exprs).any()

    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    wrapped = [wrap_expr(e) for e in pyexprs]
    # Horizontal OR expressed as a boolean left fold starting from False.
    return fold(
        lit(False), lambda a, b: a.cast(bool) | b.cast(bool), wrapped
    ).alias("any")
@overload
def all(exprs: Series) -> bool:  # type: ignore[misc]
    ...


@overload
def all(
    exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr
) -> Expr:
    ...


@deprecated_alias(columns="exprs")
def all(
    exprs: IntoExpr | Iterable[IntoExpr] | None = None, *more_exprs: IntoExpr
) -> Expr | bool:
    """
    Either return an expression representing all columns, or evaluate a bitwise AND operation.

    If no arguments are passed, this is an alias for ``pl.col("*")``.
    If a single string is passed, this is an alias for ``pl.col(name).all()``.
    If a single Series is passed, this is an alias for ``Series.all()``.

    Otherwise, this function computes the bitwise AND horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    Selecting all columns and calculating the sum:

    >>> df = pl.DataFrame(
    ...     {"a": [1, 2, 3], "b": ["hello", "foo", "bar"], "c": [1, 1, 1]}
    ... )
    >>> df.select(pl.all().sum())
    shape: (1, 3)
    ┌─────┬──────┬─────┐
    │ a   ┆ b    ┆ c   │
    │ --- ┆ ---  ┆ --- │
    │ i64 ┆ str  ┆ i64 │
    ╞═════╪══════╪═════╡
    │ 6   ┆ null ┆ 3   │
    └─────┴──────┴─────┘

    """  # noqa: W505
    # Single-input shortcuts: no args -> wildcard column; a Series is
    # evaluated eagerly; a bare string is treated as a column name.
    if not more_exprs:
        if exprs is None:
            return col("*")
        elif isinstance(exprs, pl.Series):
            return exprs.all()
        elif isinstance(exprs, str):
            return col(exprs).all()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))

    exprs_wrapped = [wrap_expr(e) for e in exprs]
    # Horizontal AND expressed as a boolean left fold starting from True.
    return fold(
        lit(True), lambda a, b: a.cast(bool) & b.cast(bool), exprs_wrapped
    ).alias("all")
def exclude(
    columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType],
    *more_columns: str | PolarsDataType,
) -> Expr:
    """
    Represent all columns except for the given columns.

    Syntactic sugar for ``pl.all().exclude(columns)``.

    Parameters
    ----------
    columns
        The name or datatype of the column(s) to exclude. Accepts regular expression
        input. Regular expressions should start with ``^`` and end with ``$``.
    *more_columns
        Additional names or datatypes of columns to exclude, specified as positional
        arguments.

    Examples
    --------
    Exclude by column name(s):

    >>> df = pl.DataFrame(
    ...     {
    ...         "aa": [1, 2, 3],
    ...         "ba": ["a", "b", None],
    ...         "cc": [None, 2.5, 1.5],
    ...     }
    ... )
    >>> df.select(pl.exclude("ba"))
    shape: (3, 2)
    ┌─────┬──────┐
    │ aa  ┆ cc   │
    │ --- ┆ ---  │
    │ i64 ┆ f64  │
    ╞═════╪══════╡
    │ 1   ┆ null │
    │ 2   ┆ 2.5  │
    │ 3   ┆ 1.5  │
    └─────┴──────┘

    Exclude by regex, e.g. removing all columns whose names end with the letter "a":

    >>> df.select(pl.exclude("^.*a$"))
    shape: (3, 1)
    ┌──────┐
    │ cc   │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ null │
    │ 2.5  │
    │ 1.5  │
    └──────┘

    Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64:

    >>> df.select(pl.exclude([pl.Int64, pl.Float64]))
    shape: (3, 1)
    ┌──────┐
    │ ba   │
    │ ---  │
    │ str  │
    ╞══════╡
    │ a    │
    │ b    │
    │ null │
    └──────┘

    """
    # Delegate to Expr.exclude on the wildcard expression.
    return col("*").exclude(columns, *more_columns)
def groups(column: str) -> Expr:
    """Syntactic sugar for ``pl.col("foo").agg_groups()``."""
    return col(column).agg_groups()
def quantile(
    column: str,
    quantile: float | Expr,
    interpolation: RollingInterpolationMethod = "nearest",
) -> Expr:
    """
    Syntactic sugar for ``pl.col("foo").quantile(..)``.

    Parameters
    ----------
    column
        Column name.
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
        Interpolation method.
    """
    column_expr = col(column)
    return column_expr.quantile(quantile, interpolation)
@overload
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = ...,
    *,
    eager: Literal[False],
    dtype: PolarsDataType | None = ...,
) -> Expr:
    ...


@overload
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = ...,
    *,
    eager: Literal[True],
    dtype: PolarsDataType | None = ...,
) -> Series:
    ...


@overload
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = ...,
    *,
    eager: bool = ...,
    dtype: PolarsDataType | None = ...,
) -> Expr | Series:
    ...


@deprecated_alias(low="start", high="end")
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = 1,
    *,
    eager: bool = False,
    dtype: PolarsDataType | None = None,
) -> Expr | Series:
    """
    Create a range expression (or Series).

    This can be used in a `select`, `with_column` etc. Be sure that the resulting
    range size is equal to the length of the DataFrame you are collecting.

    Examples
    --------
    >>> df.lazy().filter(pl.col("foo") < pl.arange(0, 100)).collect()  # doctest: +SKIP

    Parameters
    ----------
    start
        Lower bound of range.
    end
        Upper bound of range.
    step
        Step size of the range.
    eager
        Evaluate immediately and return a ``Series``. If set to ``False`` (default),
        return an expression instead.
    dtype
        Apply an explicit integer dtype to the resulting expression (default is Int64).
    """
    start = expr_to_lit_or_expr(start, str_to_lit=False)
    end = expr_to_lit_or_expr(end, str_to_lit=False)
    range_expr = wrap_expr(_arange(start._pyexpr, end._pyexpr, step))

    # The cast applies in both lazy and eager mode; skip the no-op Int64 cast.
    if dtype is not None and dtype != Int64:
        range_expr = range_expr.cast(dtype)
    if not eager:
        return range_expr
    else:
        return (
            pl.DataFrame()
            .select(range_expr)
            .to_series()
            .rename("arange", in_place=True)
        )
def arg_sort_by(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    descending: bool | Sequence[bool] = False,
) -> Expr:
    """
    Return the row indices that would sort the columns.

    Parameters
    ----------
    exprs
        Column(s) to arg sort by. Accepts expression input. Strings are parsed
        as column names.
    *more_exprs
        Additional columns to arg sort by, specified as positional arguments.
    descending
        Sort in descending order. When sorting by multiple columns, can be
        specified per column by passing a sequence of booleans.
    """
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))

    # Broadcast a single bool to every column; otherwise lengths must match.
    if isinstance(descending, bool):
        descending = [descending] * len(pyexprs)
    elif len(pyexprs) != len(descending):
        raise ValueError(
            f"the length of `descending` ({len(descending)}) does not match the length of `exprs` ({len(pyexprs)})"
        )
    return wrap_expr(_arg_sort_by(pyexprs, descending))
def duration(
    *,
    days: Expr | str | int | None = None,
    seconds: Expr | str | int | None = None,
    nanoseconds: Expr | str | int | None = None,
    microseconds: Expr | str | int | None = None,
    milliseconds: Expr | str | int | None = None,
    minutes: Expr | str | int | None = None,
    hours: Expr | str | int | None = None,
    weeks: Expr | str | int | None = None,
) -> Expr:
    """
    Create polars `Duration` from distinct time components.

    Each component accepts an expression, a column name (string), or an
    integer literal; components left as ``None`` are omitted.

    Returns
    -------
    Expr of type `pl.Duration`
    """

    def _component(value: Expr | str | int | None) -> Any:
        # Strings are parsed as column names here (str_to_lit=False).
        if value is None:
            return None
        return expr_to_lit_or_expr(value, str_to_lit=False)._pyexpr

    return wrap_expr(
        _duration(
            _component(days),
            _component(seconds),
            _component(nanoseconds),
            _component(microseconds),
            _component(milliseconds),
            _component(minutes),
            _component(hours),
            _component(weeks),
        )
    )
def datetime_(
    year: Expr | str | int,
    month: Expr | str | int,
    day: Expr | str | int,
    hour: Expr | str | int | None = None,
    minute: Expr | str | int | None = None,
    second: Expr | str | int | None = None,
    microsecond: Expr | str | int | None = None,
) -> Expr:
    """
    Create a Polars literal expression of type Datetime.

    Parameters
    ----------
    year
        column or literal.
    month
        column or literal, ranging from 1-12.
    day
        column or literal, ranging from 1-31.
    hour
        column or literal, ranging from 0-23.
    minute
        column or literal, ranging from 0-59.
    second
        column or literal, ranging from 0-59.
    microsecond
        column or literal, ranging from 0-999999.

    Returns
    -------
    Expr of type `pl.Datetime`
    """
    # Date components are mandatory; strings are parsed as column names.
    year_expr = expr_to_lit_or_expr(year, str_to_lit=False)
    month_expr = expr_to_lit_or_expr(month, str_to_lit=False)
    day_expr = expr_to_lit_or_expr(day, str_to_lit=False)

    # Time components are optional and passed through as None when omitted.
    if hour is not None:
        hour = expr_to_lit_or_expr(hour, str_to_lit=False)._pyexpr
    if minute is not None:
        minute = expr_to_lit_or_expr(minute, str_to_lit=False)._pyexpr
    if second is not None:
        second = expr_to_lit_or_expr(second, str_to_lit=False)._pyexpr
    if microsecond is not None:
        microsecond = expr_to_lit_or_expr(microsecond, str_to_lit=False)._pyexpr

    return wrap_expr(
        _datetime(
            year_expr._pyexpr,
            month_expr._pyexpr,
            day_expr._pyexpr,
            hour,
            minute,
            second,
            microsecond,
        )
    )
def date_(
    year: Expr | str | int,
    month: Expr | str | int,
    day: Expr | str | int,
) -> Expr:
    """
    Create a Polars literal expression of type Date.

    Parameters
    ----------
    year
        column or literal.
    month
        column or literal, ranging from 1-12.
    day
        column or literal, ranging from 1-31.

    Returns
    -------
    Expr of type pl.Date
    """
    # Build a Datetime expression and truncate it down to a Date.
    dt_expr = datetime_(year, month, day)
    return dt_expr.cast(Date).alias("date")
def concat_str(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    separator: str = "",
) -> Expr:
    """
    Horizontally concatenate columns into a single string column.

    Operates in linear time. Non-``Utf8`` columns are cast to ``Utf8``.

    Parameters
    ----------
    exprs
        Columns to concatenate into a single string column. Accepts expression
        input. Strings are parsed as column names, other non-expression inputs
        are parsed as literals.
    *more_exprs
        Additional columns to concatenate into a single string column,
        specified as positional arguments.
    separator
        String that will be used to separate the values of each column.
    """
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_concat_str(pyexprs, separator))
def format(f_string: str, *args: Expr | str) -> Expr:
    """
    Format expressions as a string.

    Parameters
    ----------
    f_string
        A string that with placeholders.
        For example: "hello_{}" or "{}_world
    args
        Expression(s) that fill the placeholders

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3]})
    >>> df.select(pl.format("foo_{}_bar_{}", pl.col("a"), "b").alias("fmt"))
    shape: (3, 1)
    ┌─────────────┐
    │ fmt         │
    │ ---         │
    │ str         │
    ╞═════════════╡
    │ foo_a_bar_1 │
    │ foo_b_bar_2 │
    │ foo_c_bar_3 │
    └─────────────┘
    """
    if f_string.count("{}") != len(args):
        raise ValueError("number of placeholders should equal the number of arguments")

    pieces: list[Expr] = []
    arg_iter = iter(args)
    # Each split boundary (idx > 0) corresponds to exactly one "{}" placeholder.
    for idx, segment in enumerate(f_string.split("{}")):
        if idx > 0:
            pieces.append(expr_to_lit_or_expr(next(arg_iter), str_to_lit=False))
        if segment:
            pieces.append(lit(segment))
    return concat_str(pieces, separator="")
def concat_list(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    """
    Horizontally concatenate columns into a single list column.

    Operates in linear time.

    Parameters
    ----------
    exprs
        Columns to concatenate into a single list column. Accepts expression
        input. Strings are parsed as column names, other non-expression inputs
        are parsed as literals.
    *more_exprs
        Additional columns to concatenate into a single list column, specified
        as positional arguments.
    """
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_concat_list(pyexprs))
def collect_all(
    lazy_frames: Sequence[LazyFrame],
    *,
    type_coercion: bool = True,
    predicate_pushdown: bool = True,
    projection_pushdown: bool = True,
    simplify_expression: bool = True,
    no_optimization: bool = False,
    slice_pushdown: bool = True,
    common_subplan_elimination: bool = True,
    streaming: bool = False,
) -> list[DataFrame]:
    """
    Collect multiple LazyFrames at the same time.

    This runs all the computation graphs in parallel on Polars threadpool.

    Parameters
    ----------
    lazy_frames
        A list of LazyFrames to collect.
    type_coercion
        Do type coercion optimization.
    predicate_pushdown
        Do predicate pushdown optimization.
    projection_pushdown
        Do projection pushdown optimization.
    simplify_expression
        Run simplify expressions optimization.
    no_optimization
        Turn off optimizations.
    slice_pushdown
        Slice pushdown optimization.
    common_subplan_elimination
        Will try to cache branching subplans that occur on self-joins or unions.
    streaming
        Run parts of the query in a streaming fashion (this is in an alpha state)

    Returns
    -------
    List[DataFrame]
    """
    # no_optimization overrides the pushdown/caching flags (type coercion and
    # expression simplification stay enabled, matching single-frame collect).
    if no_optimization:
        predicate_pushdown = False
        projection_pushdown = False
        slice_pushdown = False
        common_subplan_elimination = False

    prepared = [
        lf._ldf.optimization_toggle(
            type_coercion,
            predicate_pushdown,
            projection_pushdown,
            simplify_expression,
            slice_pushdown,
            common_subplan_elimination,
            streaming,
        )
        for lf in lazy_frames
    ]
    # Run all graphs in parallel, then wrap the raw results as DataFrames.
    return [wrap_df(pydf) for pydf in _collect_all(prepared)]
def select(
    exprs: IntoExpr | Iterable[IntoExpr] | None = None,
    *more_exprs: IntoExpr,
    **named_exprs: IntoExpr,
) -> DataFrame:
    """
    Run polars expressions without a context.

    This is syntactic sugar for running ``df.select`` on an empty DataFrame.

    Parameters
    ----------
    exprs
        Expression or expressions to run.
    *more_exprs
        Additional expressions to run, specified as positional arguments.
    **named_exprs
        Additional expressions to run, specified as keyword arguments. The
        expressions will be renamed to the keyword used.

    Returns
    -------
    DataFrame
    """
    empty_frame = pl.DataFrame()
    return empty_frame.select(exprs, *more_exprs, **named_exprs)
@overload
def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = ...,
    *more_exprs: IntoExpr,
    eager: Literal[False] = ...,
    schema: SchemaDict | None = ...,
    **named_exprs: IntoExpr,
) -> Expr:
    ...


@overload
def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = ...,
    *more_exprs: IntoExpr,
    eager: Literal[True],
    schema: SchemaDict | None = ...,
    **named_exprs: IntoExpr,
) -> Series:
    ...


@overload
def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = ...,
    *more_exprs: IntoExpr,
    eager: bool,
    schema: SchemaDict | None = ...,
    **named_exprs: IntoExpr,
) -> Expr | Series:
    ...


def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = None,
    *more_exprs: IntoExpr,
    eager: bool = False,
    schema: SchemaDict | None = None,
    **named_exprs: IntoExpr,
) -> Expr | Series:
    """
    Collect columns into a struct column.

    Parameters
    ----------
    exprs
        Column(s) to collect into a struct column. Accepts expression input.
        Strings are parsed as column names, other non-expression inputs are
        parsed as literals.
    *more_exprs
        Additional columns to collect into the struct column, specified as
        positional arguments.
    eager
        Evaluate immediately and return a ``Series``. If set to ``False``
        (default), return an expression instead.
    schema
        Optional schema that explicitly defines the struct field dtypes.
    **named_exprs
        Additional columns to collect into the struct column, specified as
        keyword arguments. The columns will be renamed to the keyword used.
    """
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs.extend(selection_to_pyexpr_list(more_exprs))
    # Keyword arguments become named struct fields.
    for field_name, field_expr in named_exprs.items():
        pyexprs.append(
            expr_to_lit_or_expr(field_expr, name=field_name, str_to_lit=False)._pyexpr
        )

    result = wrap_expr(_as_struct(pyexprs))
    if schema:
        # strict=False: fields absent from the schema keep their inferred dtype.
        result = result.cast(Struct(schema), strict=False)

    return select(result).to_series() if eager else result
@overload
def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: Literal[False] = ...,
    name: str | None = ...,
) -> Expr:
    ...


@overload
def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: Literal[True],
    name: str | None = ...,
) -> Series:
    ...


@overload
def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: bool,
    name: str | None,
) -> Expr | Series:
    ...


def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: bool = False,
    name: str | None = None,
) -> Expr | Series:
    """
    Repeat a single value n times.

    Parameters
    ----------
    value
        Value to repeat.
    n
        repeat `n` times
    eager
        Evaluate immediately and return a ``Series``. If set to ``False``
        (default), return an expression instead.
    name
        Only used in `eager` mode. As expression, use `alias`
    """
    if not eager:
        n_expr = lit(n) if isinstance(n, int) else n
        return wrap_expr(_repeat(value, n_expr._pyexpr))

    series_name = "" if name is None else name
    dtype = py_type_to_dtype(type(value))
    # Small integer values are stored as Int32 instead of the inferred Int64.
    if (
        dtype == Int64
        and isinstance(value, int)
        and -(2**31) <= value <= 2**31 - 1
    ):
        dtype = Int32
    return pl.Series._repeat(series_name, value, n, dtype)  # type: ignore[arg-type]
@overload
def arg_where(condition: Expr | Series, *, eager: Literal[False] = ...) -> Expr:
...
@overload
def arg_where(condition: Expr | Series, *, eager: Literal[True]) -> Series:
...
@overload
def arg_where(condition: Expr | Series, *, eager: bool) -> Expr | Series:
...
def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series:
    """
    Return indices where `condition` evaluates `True`.

    Parameters
    ----------
    condition
        Boolean expression to evaluate
    eager
        Evaluate immediately and return a ``Series``. If set to ``False`` (default),
        return an expression instead.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    >>> df.select(
    ...     [
    ...         pl.arg_where(pl.col("a") % 2 == 0),
    ...     ]
    ... ).to_series()
    shape: (2,)
    Series: 'a' [u32]
    [
        1
        3
    ]

    See Also
    --------
    Series.arg_true : Return indices where Series is True

    """
    # Lazy path: parse the input (strings become literals) and defer.
    if not eager:
        parsed = expr_to_lit_or_expr(condition, str_to_lit=True)
        return wrap_expr(_arg_where(parsed._pyexpr))

    # Eager path requires a concrete Series to evaluate against.
    if not isinstance(condition, pl.Series):
        raise ValueError(
            "expected 'Series' in 'arg_where' if 'eager=True', got"
            f" {type(condition)}"
        )
    # Re-enter the lazy branch via the Series' own column name, then collect.
    return condition.to_frame().select(arg_where(col(condition.name))).to_series()
def coalesce(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    """
    Folds the columns from left to right, keeping the first non-null value.

    Parameters
    ----------
    exprs
        Columns to coalesce. Accepts expression input. Strings are parsed as column
        names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to coalesce, specified as positional arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, None, None, None],
    ...         "b": [1, 2, None, None],
    ...         "c": [5, None, 3, None],
    ...     }
    ... )
    >>> df.with_columns(pl.coalesce(["a", "b", "c", 10]).alias("d"))
    shape: (4, 4)
    ┌──────┬──────┬──────┬─────┐
    │ a    ┆ b    ┆ c    ┆ d   │
    │ ---  ┆ ---  ┆ ---  ┆ --- │
    │ i64  ┆ i64  ┆ i64  ┆ i64 │
    ╞══════╪══════╪══════╪═════╡
    │ 1    ┆ 1    ┆ 5    ┆ 1   │
    │ null ┆ 2    ┆ null ┆ 2   │
    │ null ┆ null ┆ 3    ┆ 3   │
    │ null ┆ null ┆ null ┆ 10  │
    └──────┴──────┴──────┴─────┘
    >>> df.with_columns(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    shape: (4, 4)
    ┌──────┬──────┬──────┬──────┐
    │ a    ┆ b    ┆ c    ┆ d    │
    │ ---  ┆ ---  ┆ ---  ┆ ---  │
    │ i64  ┆ i64  ┆ i64  ┆ f64  │
    ╞══════╪══════╪══════╪══════╡
    │ 1    ┆ 1    ┆ 5    ┆ 1.0  │
    │ null ┆ 2    ┆ null ┆ 2.0  │
    │ null ┆ null ┆ 3    ┆ 3.0  │
    │ null ┆ null ┆ null ┆ 10.0 │
    └──────┴──────┴──────┴──────┘

    """
    # Normalize the primary input, then append any extra positional columns.
    pyexprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        pyexprs += selection_to_pyexpr_list(more_exprs)
    return wrap_expr(_coalesce(pyexprs))
# Overload: a column name or expression input yields a lazy ``Expr``.
@overload
def from_epoch(column: str | Expr, time_unit: EpochTimeUnit = ...) -> Expr:
    ...
# Overload: a concrete Series (or plain sequence of ints) is converted
# eagerly and yields a ``Series``.
@overload
def from_epoch(
    column: Series | Sequence[int], time_unit: EpochTimeUnit = ...
) -> Series:
    ...
def from_epoch(
    column: str | Expr | Series | Sequence[int], time_unit: EpochTimeUnit = "s"
) -> Expr | Series:
    """
    Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).

    Depending on the `time_unit` provided, this function will return a different dtype:

    - time_unit="d" returns pl.Date
    - time_unit="s" returns pl.Datetime["us"] (pl.Datetime's default)
    - time_unit="ms" returns pl.Datetime["ms"]
    - time_unit="us" returns pl.Datetime["us"]
    - time_unit="ns" returns pl.Datetime["ns"]

    Parameters
    ----------
    column
        Series or expression to parse integers to pl.Datetime.
    time_unit
        The unit of time of the timesteps since epoch time.

    Examples
    --------
    >>> df = pl.DataFrame({"timestamp": [1666683077, 1666683099]}).lazy()
    >>> df.select(pl.from_epoch(pl.col("timestamp"), time_unit="s")).collect()
    shape: (2, 1)
    ┌─────────────────────┐
    │ timestamp           │
    │ ---                 │
    │ datetime[μs]        │
    ╞═════════════════════╡
    │ 2022-10-25 07:31:17 │
    │ 2022-10-25 07:31:39 │
    └─────────────────────┘

    The function can also be used in an eager context by passing a Series.

    >>> s = pl.Series([12345, 12346])
    >>> pl.from_epoch(s, time_unit="d")
    shape: (2,)
    Series: '' [date]
    [
        2003-10-20
        2003-10-21
    ]

    """
    # Normalize the input: a string is a column reference; anything that is
    # neither a Series nor an Expr is treated as raw sequence data.
    if isinstance(column, str):
        column = col(column)
    elif not isinstance(column, (pl.Series, pl.Expr)):
        column = pl.Series(column)  # Sequence input handled by Series constructor

    if time_unit == "d":
        return column.cast(Date)
    if time_unit == "s":
        # Seconds are scaled to microseconds before casting, since Datetime's
        # default (and here explicit) unit is "us".
        return (column.cast(Int64) * 1_000_000).cast(Datetime("us"))
    if time_unit in DTYPE_TEMPORAL_UNITS:
        return column.cast(Datetime(time_unit))
    raise ValueError(
        f"'time_unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got {time_unit!r}."
    )