Source code for polars.functions.lazy

from __future__ import annotations

import contextlib
import warnings
from datetime import date, datetime, time, timedelta
from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence, overload

import polars._reexport as pl
from polars.datatypes import (
    DTYPE_TEMPORAL_UNITS,
    Date,
    Datetime,
    Duration,
    Int32,
    Int64,
    Struct,
    Time,
    UInt32,
    is_polars_dtype,
    py_type_to_dtype,
)
from polars.dependencies import _check_for_numpy
from polars.dependencies import numpy as np
from polars.utils._parse_expr_input import expr_to_lit_or_expr, selection_to_pyexpr_list
from polars.utils._wrap import wrap_df, wrap_expr
from polars.utils.convert import (
    _datetime_to_pl_timestamp,
    _time_to_pl_time,
    _timedelta_to_pl_timedelta,
)
from polars.utils.decorators import deprecated_alias
from polars.utils.various import find_stacklevel

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars.polars import arange as _arange
    from polars.polars import arg_sort_by as _arg_sort_by
    from polars.polars import arg_where as _arg_where
    from polars.polars import as_struct as _as_struct
    from polars.polars import coalesce as _coalesce
    from polars.polars import col as _col
    from polars.polars import collect_all as _collect_all
    from polars.polars import cols as _cols
    from polars.polars import concat_list as _concat_list
    from polars.polars import concat_str as _concat_str
    from polars.polars import count as _count
    from polars.polars import cov as _cov
    from polars.polars import cumfold as _cumfold
    from polars.polars import cumreduce as _cumreduce
    from polars.polars import datetime as _datetime
    from polars.polars import dtype_cols as _dtype_cols
    from polars.polars import duration as _duration
    from polars.polars import first as _first
    from polars.polars import fold as _fold
    from polars.polars import last as _last
    from polars.polars import lit as _lit
    from polars.polars import map_mul as _map_mul
    from polars.polars import max_exprs as _max_exprs
    from polars.polars import min_exprs as _min_exprs
    from polars.polars import pearson_corr as _pearson_corr
    from polars.polars import reduce as _reduce
    from polars.polars import repeat as _repeat
    from polars.polars import spearman_rank_corr as _spearman_rank_corr
    from polars.polars import sum_exprs as _sum_exprs


if TYPE_CHECKING:
    import sys

    from polars.dataframe import DataFrame
    from polars.expr.expr import Expr
    from polars.lazyframe import LazyFrame
    from polars.series import Series
    from polars.type_aliases import (
        CorrelationMethod,
        EpochTimeUnit,
        IntoExpr,
        PolarsDataType,
        PythonLiteral,
        RollingInterpolationMethod,
        SchemaDict,
        TimeUnit,
    )

    if sys.version_info >= (3, 8):
        from typing import Literal
    else:
        from typing_extensions import Literal


[docs]def col(
    name: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType],
    *more_names: str | PolarsDataType,
) -> Expr:
    """
    Return an expression representing column(s) in a dataframe.

    Parameters
    ----------
    name
        The name or datatype of the column(s) to represent. Accepts regular expression
        input. Regular expressions should start with ``^`` and end with ``$``.
    *more_names
        Additional names or datatypes of columns to represent, specified as positional
        arguments.

    Examples
    --------
    Pass a single column name to represent that column.

    >>> df = pl.DataFrame(
    ...     {
    ...         "ham": [1, 2, 3],
    ...         "hamburger": [11, 22, 33],
    ...         "foo": [3, 2, 1],
    ...         "bar": ["a", "b", "c"],
    ...     }
    ... )
    >>> df.select(pl.col("foo"))
    shape: (3, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    │ 2   │
    │ 1   │
    └─────┘

    Use the wildcard ``*`` to represent all columns.

    >>> df.select(pl.col("*"))
    shape: (3, 4)
    ┌─────┬───────────┬─────┬─────┐
    │ ham ┆ hamburger ┆ foo ┆ bar │
    │ --- ┆ ---       ┆ --- ┆ --- │
    │ i64 ┆ i64       ┆ i64 ┆ str │
    ╞═════╪═══════════╪═════╪═════╡
    │ 1   ┆ 11        ┆ 3   ┆ a   │
    │ 2   ┆ 22        ┆ 2   ┆ b   │
    │ 3   ┆ 33        ┆ 1   ┆ c   │
    └─────┴───────────┴─────┴─────┘
    >>> df.select(pl.col("*").exclude("ham"))
    shape: (3, 3)
    ┌───────────┬─────┬─────┐
    │ hamburger ┆ foo ┆ bar │
    │ ---       ┆ --- ┆ --- │
    │ i64       ┆ i64 ┆ str │
    ╞═══════════╪═════╪═════╡
    │ 11        ┆ 3   ┆ a   │
    │ 22        ┆ 2   ┆ b   │
    │ 33        ┆ 1   ┆ c   │
    └───────────┴─────┴─────┘

    Regular expression input is supported.

    >>> df.select(pl.col("^ham.*$"))
    shape: (3, 2)
    ┌─────┬───────────┐
    │ ham ┆ hamburger │
    │ --- ┆ ---       │
    │ i64 ┆ i64       │
    ╞═════╪═══════════╡
    │ 1   ┆ 11        │
    │ 2   ┆ 22        │
    │ 3   ┆ 33        │
    └─────┴───────────┘

    Multiple columns can be represented by passing a list of names.

    >>> df.select(pl.col(["hamburger", "foo"]))
    shape: (3, 2)
    ┌───────────┬─────┐
    │ hamburger ┆ foo │
    │ ---       ┆ --- │
    │ i64       ┆ i64 │
    ╞═══════════╪═════╡
    │ 11        ┆ 3   │
    │ 22        ┆ 2   │
    │ 33        ┆ 1   │
    └───────────┴─────┘

    Or use positional arguments to represent multiple columns in the same way.

    >>> df.select(pl.col("hamburger", "foo"))
    shape: (3, 2)
    ┌───────────┬─────┐
    │ hamburger ┆ foo │
    │ ---       ┆ --- │
    │ i64       ┆ i64 │
    ╞═══════════╪═════╡
    │ 11        ┆ 3   │
    │ 22        ┆ 2   │
    │ 33        ┆ 1   │
    └───────────┴─────┘

    Easily select all columns that match a certain data type by passing that datatype.

    >>> df.select(pl.col(pl.Utf8))
    shape: (3, 1)
    ┌─────┐
    │ bar │
    │ --- │
    │ str │
    ╞═════╡
    │ a   │
    │ b   │
    │ c   │
    └─────┘
    >>> df.select(pl.col(pl.Int64, pl.Float64))
    shape: (3, 3)
    ┌─────┬───────────┬─────┐
    │ ham ┆ hamburger ┆ foo │
    │ --- ┆ ---       ┆ --- │
    │ i64 ┆ i64       ┆ i64 │
    ╞═════╪═══════════╪═════╡
    │ 1   ┆ 11        ┆ 3   │
    │ 2   ┆ 22        ┆ 2   │
    │ 3   ┆ 33        ┆ 1   │
    └─────┴───────────┴─────┘

    """
    if more_names:
        if isinstance(name, str):
            names_str = [name]
            names_str.extend(more_names)  # type: ignore[arg-type]
            return wrap_expr(_cols(names_str))
        elif is_polars_dtype(name):
            dtypes = [name]
            dtypes.extend(more_names)
            return wrap_expr(_dtype_cols(dtypes))
        else:
            raise TypeError(
                f"Invalid input for `col`. Expected `str` or `DataType`, got {type(name)!r}"
            )

    if isinstance(name, str):
        return wrap_expr(_col(name))
    elif is_polars_dtype(name):
        return wrap_expr(_dtype_cols([name]))
    elif isinstance(name, Iterable):
        names = list(name)
        if not names:
            return wrap_expr(_cols(names))

        item = names[0]
        if isinstance(item, str):
            return wrap_expr(_cols(names))
        elif is_polars_dtype(item):
            return wrap_expr(_dtype_cols(names))
        else:
            raise TypeError(
                "Invalid input for `col`. Expected iterable of type `str` or `DataType`,"
                f" got iterable of type {type(item)!r}"
            )
    else:
        raise TypeError(
            f"Invalid input for `col`. Expected `str` or `DataType`, got {type(name)!r}"
        )


def element() -> Expr:
    """
    Alias for an element being evaluated in an `eval` expression.

    Examples
    --------
    A horizontal rank computation by taking the elements of a list

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
    >>> df.with_columns(
    ...     pl.concat_list(["a", "b"]).arr.eval(pl.element().rank()).alias("rank")
    ... )
    shape: (3, 3)
    ┌─────┬─────┬────────────┐
    │ a   ┆ b   ┆ rank       │
    │ --- ┆ --- ┆ ---        │
    │ i64 ┆ i64 ┆ list[f32]  │
    ╞═════╪═════╪════════════╡
    │ 1   ┆ 4   ┆ [1.0, 2.0] │
    │ 8   ┆ 5   ┆ [2.0, 1.0] │
    │ 3   ┆ 2   ┆ [2.0, 1.0] │
    └─────┴─────┴────────────┘

    A mathematical operation on array elements

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
    >>> df.with_columns(
    ...     pl.concat_list(["a", "b"]).arr.eval(pl.element() * 2).alias("a_b_doubled")
    ... )
    shape: (3, 3)
    ┌─────┬─────┬─────────────┐
    │ a   ┆ b   ┆ a_b_doubled │
    │ --- ┆ --- ┆ ---         │
    │ i64 ┆ i64 ┆ list[i64]   │
    ╞═════╪═════╪═════════════╡
    │ 1   ┆ 4   ┆ [2, 8]      │
    │ 8   ┆ 5   ┆ [16, 10]    │
    │ 3   ┆ 2   ┆ [6, 4]      │
    └─────┴─────┴─────────────┘

    """
    return col("")


@overload
def count(column: str) -> Expr:
    ...


@overload
def count(column: Series) -> int:
    ...


@overload
def count(column: None = None) -> Expr:
    ...


def count(column: str | Series | None = None) -> Expr | int:
    """
    Count the number of values in this column/context.

    Parameters
    ----------
    column
        If dtype is:

        * ``pl.Series`` : count the values in the series.
        * ``str`` : count the values in this column.
        * ``None`` : count the number of values in this context.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.count())
    shape: (1, 1)
    ┌───────┐
    │ count │
    │ ---   │
    │ u32   │
    ╞═══════╡
    │ 3     │
    └───────┘
    >>> df.groupby("c", maintain_order=True).agg(pl.count())
    shape: (2, 2)
    ┌─────┬───────┐
    │ c   ┆ count │
    │ --- ┆ ---   │
    │ str ┆ u32   │
    ╞═════╪═══════╡
    │ foo ┆ 2     │
    │ bar ┆ 1     │
    └─────┴───────┘

    """
    if column is None:
        return wrap_expr(_count())

    if isinstance(column, pl.Series):
        return column.len()
    return col(column).count()


def implode(name: str) -> Expr:
    """
    Aggregate all column values into a list.

    Parameters
    ----------
    name
        Name of the column that should be imploded.

    """
    return col(name).implode()


def list_(name: str) -> Expr:
    """
    Aggregate to list.

    .. deprecated:: 0.17.3
        ``list`` will be removed in favor of ``implode``.

    Parameters
    ----------
    name
        Name of the column that should be aggregated into a list.

    """
    warnings.warn(
        "`pl.list` is deprecated, please use `pl.implode` instead.",
        DeprecationWarning,
        stacklevel=find_stacklevel(),
    )
    return col(name).implode()


@overload
def std(column: str, ddof: int = 1) -> Expr:
    ...


@overload
def std(column: Series, ddof: int = 1) -> float | None:
    ...


def std(column: str | Series, ddof: int = 1) -> Expr | float | None:
    """
    Get the standard deviation.

    Parameters
    ----------
    column
        Column to get the standard deviation from.
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.std("a"))
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 3.605551 │
    └──────────┘
    >>> df["a"].std()
    3.605551275463989

    """
    if isinstance(column, pl.Series):
        return column.std(ddof)
    return col(column).std(ddof)


@overload
def var(column: str, ddof: int = 1) -> Expr:
    ...


@overload
def var(column: Series, ddof: int = 1) -> float | None:
    ...


def var(column: str | Series, ddof: int = 1) -> Expr | float | None:
    """
    Get the variance.

    Parameters
    ----------
    column
        Column to get the variance of.
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.var("a"))
    shape: (1, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ 13.0 │
    └──────┘
    >>> df["a"].var()
    13.0

    """
    if isinstance(column, pl.Series):
        return column.var(ddof)
    return col(column).var(ddof)


@overload
def max(exprs: Series) -> PythonLiteral | None:  # type: ignore[misc]
    ...


@overload
def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


def max(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | Any:
    """
    Get the maximum value.

    If a single string is passed, this is an alias for ``pl.col(name).max()``.
    If a single Series is passed, this is an alias for ``Series.max()``.

    Otherwise, this function computes the maximum value horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    Get the maximum value by row by passing multiple columns/expressions.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 8, 3],
    ...         "b": [4, 5, 2],
    ...         "c": ["foo", "bar", "foo"],
    ...     }
    ... )
    >>> df.select(pl.max("a", "b"))
    shape: (3, 1)
    ┌─────┐
    │ max │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 4   │
    │ 8   │
    │ 3   │
    └─────┘

    Get the maximum value of a column by passing a single column name.

    >>> df.select(pl.max("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 8   │
    └─────┘

    Get column-wise maximums for multiple columns by passing a regular expression,
    or call ``.max()`` on a multi-column expression instead.

    >>> df.select(pl.max("^a|b$"))
    shape: (1, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 8   ┆ 5   │
    └─────┴─────┘
    >>> df.select(pl.col("a", "b").max())
    shape: (1, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 8   ┆ 5   │
    └─────┴─────┘

    """
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.max()
        elif isinstance(exprs, str):
            return col(exprs).max()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_max_exprs(exprs))


@overload
def min(exprs: Series) -> PythonLiteral | None:  # type: ignore[misc]
    ...


@overload
def min(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


def min(
    exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr
) -> Expr | PythonLiteral | None:
    """
    Get the minimum value.

    If a single string is passed, this is an alias for ``pl.col(name).min()``.
    If a single Series is passed, this is an alias for ``Series.min()``.

    Otherwise, this function computes the minimum value horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    Get the minimum value by row by passing multiple columns/expressions.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 8, 3],
    ...         "b": [4, 5, 2],
    ...         "c": ["foo", "bar", "foo"],
    ...     }
    ... )
    >>> df.select(pl.min("a", "b"))
    shape: (3, 1)
    ┌─────┐
    │ min │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 5   │
    │ 2   │
    └─────┘

    Get the minimum value of a column by passing a single column name.

    >>> df.select(pl.min("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    └─────┘

    Get column-wise minimums for multiple columns by passing a regular expression,
    or call ``.min()`` on a multi-column expression instead.

    >>> df.select(pl.min("^a|b$"))
    shape: (1, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 2   │
    └─────┴─────┘
    >>> df.select(pl.col("a", "b").min())
    shape: (1, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 2   │
    └─────┴─────┘

    """
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.min()
        elif isinstance(exprs, str):
            return col(exprs).min()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_min_exprs(exprs))


@overload
def sum(exprs: Series) -> int | float:  # type: ignore[misc]
    ...


@overload
def sum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


@deprecated_alias(column="exprs")
def sum(
    exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr
) -> Expr | int | float:
    """
    Sum all values.

    If a single string is passed, this is an alias for ``pl.col(name).sum()``.
    If a single Series is passed, this is an alias for ``Series.sum()``.

    Otherwise, this function computes the sum horizontally across multiple columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2],
    ...         "b": [3, 4],
    ...         "c": [5, 6],
    ...     }
    ... )
    >>> df
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   │
    │ 2   ┆ 4   ┆ 6   │
    └─────┴─────┴─────┘

    Sum a column by name:

    >>> df.select(pl.sum("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    └─────┘

    Sum a list of columns/expressions horizontally:

    >>> df.with_columns(pl.sum("a", "c"))
    shape: (2, 4)
    ┌─────┬─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   ┆ sum │
    │ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   ┆ 6   │
    │ 2   ┆ 4   ┆ 6   ┆ 8   │
    └─────┴─────┴─────┴─────┘

    Sum a series:

    >>> pl.sum(df.get_column("a"))
    3

    To aggregate the sums for more than one column/expression use ``pl.col(list).sum()``
    or a regular expression selector like ``pl.sum(regex)``:

    >>> df.select(pl.col("a", "c").sum())
    shape: (1, 2)
    ┌─────┬─────┐
    │ a   ┆ c   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 3   ┆ 11  │
    └─────┴─────┘

    >>> df.select(pl.sum("^.*[bc]$"))
    shape: (1, 2)
    ┌─────┬─────┐
    │ b   ┆ c   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 7   ┆ 11  │
    └─────┴─────┘

    """
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.sum()
        elif isinstance(exprs, str):
            return col(exprs).sum()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_sum_exprs(exprs))


@overload
def mean(column: str) -> Expr:
    ...


@overload
def mean(column: Series) -> float:
    ...


def mean(column: str | Series) -> Expr | float | None:
    """
    Get the mean value.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.mean("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 4.0 │
    └─────┘
    >>> pl.mean(df["a"])
    4.0

    """
    if isinstance(column, pl.Series):
        return column.mean()
    return col(column).mean()


@overload
def avg(column: str) -> Expr:
    ...


@overload
def avg(column: Series) -> float:
    ...


def avg(column: str | Series) -> Expr | float:
    """
    Alias for mean.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.avg("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 4.0 │
    └─────┘
    >>> pl.avg(df["a"])
    4.0

    """
    return mean(column)


@overload
def median(column: str) -> Expr:
    ...


@overload
def median(column: Series) -> float | int:
    ...


def median(column: str | Series) -> Expr | float | int | None:
    """
    Get the median value.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.median("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 3.0 │
    └─────┘
    >>> pl.median(df["a"])
    3.0

    """
    if isinstance(column, pl.Series):
        return column.median()
    return col(column).median()


@overload
def n_unique(column: str) -> Expr:
    ...


@overload
def n_unique(column: Series) -> int:
    ...


def n_unique(column: str | Series) -> Expr | int:
    """
    Count unique values.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 1], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.n_unique("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    └─────┘
    >>> pl.n_unique(df["a"])
    2

    """
    if isinstance(column, pl.Series):
        return column.n_unique()
    return col(column).n_unique()


def approx_unique(column: str | Expr) -> Expr:
    """
    Approx count unique values.

    This is done using the HyperLogLog++ algorithm for cardinality estimation.

    Parameters
    ----------
    column
        Column name or Series.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 1], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.approx_unique("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    └─────┘

    """
    if isinstance(column, pl.Expr):
        return column.approx_unique()
    return col(column).approx_unique()


@overload
def first(column: str) -> Expr:
    ...


@overload
def first(column: Series) -> Any:
    ...


@overload
def first(column: None = None) -> Expr:
    ...


def first(column: str | Series | None = None) -> Expr | Any:
    """
    Get the first value.

    Depending on the input type this function does different things:

    input:

    - None -> expression to take first column of a context.
    - str -> syntactic sugar for `pl.col(..).first()`
    - Series -> Take first value in `Series`

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.first())
    shape: (3, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 8   │
    │ 3   │
    └─────┘
    >>> df.select(pl.first("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    └─────┘
    >>> pl.first(df["a"])
    1

    """
    if column is None:
        return wrap_expr(_first())

    if isinstance(column, pl.Series):
        if column.len() > 0:
            return column[0]
        else:
            raise IndexError("The series is empty, so no first value can be returned.")
    return col(column).first()


@overload
def last(column: str) -> Expr:
    ...


@overload
def last(column: Series) -> Any:
    ...


@overload
def last(column: None = None) -> Expr:
    ...


def last(column: str | Series | None = None) -> Expr:
    """
    Get the last value.

    Depending on the input type this function does different things:

    - None -> expression to take last column of a context.
    - str -> syntactic sugar for `pl.col(..).last()`
    - Series -> Take last value in `Series`

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.last())
    shape: (3, 1)
    ┌─────┐
    │ c   │
    │ --- │
    │ str │
    ╞═════╡
    │ foo │
    │ bar │
    │ foo │
    └─────┘
    >>> df.select(pl.last("a"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    └─────┘
    >>> pl.last(df["a"])
    3

    """
    if column is None:
        return wrap_expr(_last())

    if isinstance(column, pl.Series):
        if column.len() > 0:
            return column[-1]
        else:
            raise IndexError("The series is empty, so no last value can be returned,")
    return col(column).last()


@overload
def head(column: str, n: int = ...) -> Expr:
    ...


@overload
def head(column: Series, n: int = ...) -> Series:
    ...


def head(column: str | Series, n: int = 10) -> Expr | Series:
    """
    Get the first `n` rows.

    Parameters
    ----------
    column
        Column name or Series.
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.head("a"))
    shape: (3, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 8   │
    │ 3   │
    └─────┘
    >>> df.select(pl.head("a", 2))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 8   │
    └─────┘
    >>> pl.head(df["a"], 2)
    shape: (2,)
    Series: 'a' [i64]
    [
        1
        8
    ]

    """
    if isinstance(column, pl.Series):
        return column.head(n)
    return col(column).head(n)


@overload
def tail(column: str, n: int = ...) -> Expr:
    ...


@overload
def tail(column: Series, n: int = ...) -> Series:
    ...


def tail(column: str | Series, n: int = 10) -> Expr | Series:
    """
    Get the last `n` rows.

    Parameters
    ----------
    column
        Column name or Series.
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.tail("a"))
    shape: (3, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 8   │
    │ 3   │
    └─────┘
    >>> df.select(pl.tail("a", 2))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 8   │
    │ 3   │
    └─────┘
    >>> pl.tail(df["a"], 2)
    shape: (2,)
    Series: 'a' [i64]
    [
        8
        3
    ]

    """
    if isinstance(column, pl.Series):
        return column.tail(n)
    return col(column).tail(n)


[docs]def lit(
    value: Any, dtype: PolarsDataType | None = None, *, allow_object: bool = False
) -> Expr:
    """
    Return an expression representing a literal value.

    Parameters
    ----------
    value
        Value that should be used as a `literal`.
    dtype
        Optionally define a dtype.
    allow_object
        If type is unknown use an 'object' type.
        By default, we will raise a `ValueException`
        if the type is unknown.

    Examples
    --------
    Literal scalar values:

    >>> pl.lit(1)  # doctest: +IGNORE_RESULT
    >>> pl.lit(5.5)  # doctest: +IGNORE_RESULT
    >>> pl.lit(None)  # doctest: +IGNORE_RESULT
    >>> pl.lit("foo_bar")  # doctest: +IGNORE_RESULT
    >>> pl.lit(date(2021, 1, 20))  # doctest: +IGNORE_RESULT
    >>> pl.lit(datetime(2023, 3, 31, 10, 30, 45))  # doctest: +IGNORE_RESULT

    Literal list/Series data (1D):

    >>> pl.lit([1, 2, 3])  # doctest: +IGNORE_RESULT
    >>> pl.lit(pl.Series("x", [1, 2, 3]))  # doctest: +IGNORE_RESULT

    Literal list/Series data (2D):

    >>> pl.lit([[1, 2], [3, 4]])  # doctest: +IGNORE_RESULT
    >>> pl.lit(pl.Series("y", [[1, 2], [3, 4]]))  # doctest: +IGNORE_RESULT

    Expected datatypes

    - ''pl.lit([])'' -> empty  Series Float32
    - ''pl.lit([1, 2, 3])'' -> Series Int64
    - ''pl.lit([[]])''-> empty  Series List<Null>
    - ''pl.lit([[1, 2, 3]])'' -> Series List<i64>
    - ''pl.lit(None)'' -> Series Null

    """
    time_unit: TimeUnit

    if isinstance(value, datetime):
        time_unit = "us" if dtype is None else getattr(dtype, "time_unit", "us")
        time_zone = (
            value.tzinfo
            if getattr(dtype, "time_zone", None) is None
            else getattr(dtype, "time_zone", None)
        )
        if value.tzinfo is not None and getattr(dtype, "time_zone", None) is not None:
            raise TypeError(
                "Cannot cast tz-aware value to tz-aware dtype. "
                "Please drop the time zone from the dtype."
            )
        e = lit(_datetime_to_pl_timestamp(value, time_unit)).cast(Datetime(time_unit))
        if time_zone is not None:
            return e.dt.replace_time_zone(str(time_zone))
        else:
            return e

    elif isinstance(value, timedelta):
        time_unit = "us" if dtype is None else getattr(dtype, "time_unit", "us")
        return lit(_timedelta_to_pl_timedelta(value, time_unit)).cast(
            Duration(time_unit)
        )

    elif isinstance(value, time):
        return lit(_time_to_pl_time(value)).cast(Time)

    elif isinstance(value, date):
        return lit(datetime(value.year, value.month, value.day)).cast(Date)

    elif isinstance(value, pl.Series):
        name = value.name
        value = value._s
        e = wrap_expr(_lit(value, allow_object))
        if name == "":
            return e
        return e.alias(name)

    elif (_check_for_numpy(value) and isinstance(value, np.ndarray)) or isinstance(
        value, (list, tuple)
    ):
        return lit(pl.Series("", value))

    elif dtype:
        return wrap_expr(_lit(value, allow_object)).cast(dtype)

    try:
        # numpy literals like np.float32(0) have item/dtype
        item = value.item()

        # numpy item() is py-native datetime/timedelta when units < 'ns'
        if isinstance(item, (datetime, timedelta)):
            return lit(item)

        # handle 'ns' units
        if isinstance(item, int) and hasattr(value, "dtype"):
            dtype_name = value.dtype.name
            if dtype_name.startswith(("datetime64[", "timedelta64[")):
                time_unit = dtype_name[11:-1]
                return lit(item).cast(
                    Datetime(time_unit)
                    if dtype_name.startswith("date")
                    else Duration(time_unit)
                )

    except AttributeError:
        item = value
    return wrap_expr(_lit(item, allow_object))


@overload
def cumsum(exprs: Series) -> Series:  # type: ignore[misc]
    ...


@overload
def cumsum(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


@deprecated_alias(column="exprs")
def cumsum(
    exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr
) -> Expr | Series:
    """
    Cumulatively sum all values.

    If a single string is passed, this is an alias for ``pl.col(name).cumsum()``.
    If a single Series is passed, this is an alias for ``Series.cumsum()``.

    Otherwise, this function computes the cumulative sum horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2],
    ...         "b": [3, 4],
    ...         "c": [5, 6],
    ...     }
    ... )
    >>> df
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   │
    │ 2   ┆ 4   ┆ 6   │
    └─────┴─────┴─────┘

    Cumulatively sum a column by name:

    >>> df.select(pl.cumsum("a"))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 3   │
    └─────┘

    Cumulatively sum a list of columns/expressions horizontally:

    >>> df.with_columns(pl.cumsum("a", "c"))
    shape: (2, 4)
    ┌─────┬─────┬─────┬───────────┐
    │ a   ┆ b   ┆ c   ┆ cumsum    │
    │ --- ┆ --- ┆ --- ┆ ---       │
    │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
    ╞═════╪═════╪═════╪═══════════╡
    │ 1   ┆ 3   ┆ 5   ┆ {1,6}     │
    │ 2   ┆ 4   ┆ 6   ┆ {2,8}     │
    └─────┴─────┴─────┴───────────┘

    """
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.cumsum()
        elif isinstance(exprs, str):
            return col(exprs).cumsum()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))

    # (Expr): use u32 as that will not cast to float as eagerly
    exprs_wrapped = [wrap_expr(e) for e in exprs]
    return cumfold(lit(0).cast(UInt32), lambda a, b: a + b, exprs_wrapped).alias(
        "cumsum"
    )


def spearman_rank_corr(
    a: str | Expr, b: str | Expr, ddof: int = 1, *, propagate_nans: bool = False
) -> Expr:
    """
    Compute the spearman rank correlation between two columns.

    Missing data will be excluded from the computation.

    .. deprecated:: 0.16.10
        ``spearman_rank_corr`` will be removed in favor of
        ``corr(..., method="spearman")``.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.
    propagate_nans
        If `True` any `NaN` encountered will lead to `NaN` in the output.
        Defaults to `False` where `NaN` are regarded as larger than any finite number
        and thus lead to the highest rank.

    See Also
    --------
    corr

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.spearman_rank_corr("a", "b"))  # doctest: +SKIP
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 0.5 │
    └─────┘
    """
    warnings.warn(
        "`spearman_rank_corr()` is deprecated in favor of `corr()`",
        DeprecationWarning,
        stacklevel=find_stacklevel(),
    )
    if isinstance(a, str):
        a = col(a)
    if isinstance(b, str):
        b = col(b)
    return wrap_expr(_spearman_rank_corr(a._pyexpr, b._pyexpr, ddof, propagate_nans))


def pearson_corr(a: str | Expr, b: str | Expr, ddof: int = 1) -> Expr:
    """
    Compute the pearson's correlation between two columns.

    .. deprecated:: 0.16.10
        ``pearson_corr`` will be removed in favor of ``corr(..., method="pearson")``.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    See Also
    --------
    corr

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.pearson_corr("a", "b"))  # doctest: +SKIP
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.544705 │
    └──────────┘
    """
    warnings.warn(
        "`pearson_corr()` is deprecated in favor of `corr()`",
        DeprecationWarning,
        stacklevel=find_stacklevel(),
    )
    if isinstance(a, str):
        a = col(a)
    if isinstance(b, str):
        b = col(b)
    return wrap_expr(_pearson_corr(a._pyexpr, b._pyexpr, ddof))


def corr(
    a: str | Expr,
    b: str | Expr,
    *,
    method: CorrelationMethod = "pearson",
    ddof: int = 1,
    propagate_nans: bool = False,
) -> Expr:
    """
    Compute the pearson's or spearman rank correlation correlation between two columns.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.
    method : {'pearson', 'spearman'}
        Correlation method.
    propagate_nans
        If `True` any `NaN` encountered will lead to `NaN` in the output.
        Defaults to `False` where `NaN` are regarded as larger than any finite number
        and thus lead to the highest rank.

    Examples
    --------
    Pearson's correlation:

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.corr("a", "b"))
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.544705 │
    └──────────┘

    Spearman rank correlation:

    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.corr("a", "b", method="spearman"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 0.5 │
    └─────┘
    """
    if isinstance(a, str):
        a = col(a)
    if isinstance(b, str):
        b = col(b)

    if method == "pearson":
        return wrap_expr(_pearson_corr(a._pyexpr, b._pyexpr, ddof))
    elif method == "spearman":
        return wrap_expr(
            _spearman_rank_corr(a._pyexpr, b._pyexpr, ddof, propagate_nans)
        )
    else:
        raise ValueError(
            f"method must be one of {{'pearson', 'spearman'}}, got {method!r}"
        )


def cov(a: str | Expr, b: str | Expr) -> Expr:
    """
    Compute the covariance between two columns/ expressions.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2], "c": ["foo", "bar", "foo"]})
    >>> df.select(pl.cov("a", "b"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 3.0 │
    └─────┘

    """
    if isinstance(a, str):
        a = col(a)
    if isinstance(b, str):
        b = col(b)
    return wrap_expr(_cov(a._pyexpr, b._pyexpr))


def map(
    exprs: Sequence[str] | Sequence[Expr],
    function: Callable[[Sequence[Series]], Series],
    return_dtype: PolarsDataType | None = None,
) -> Expr:
    """
    Map a custom function over multiple columns/expressions.

    Produces a single Series result.

    Parameters
    ----------
    exprs
        Input Series to f
    function
        Function to apply over the input
    return_dtype
        dtype of the output Series

    Returns
    -------
    Expr

    Examples
    --------
    >>> def test_func(a, b, c):
    ...     return a + b + c
    ...
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3, 4],
    ...         "b": [4, 5, 6, 7],
    ...     }
    ... )
    >>>
    >>> df.with_columns(
    ...     (
    ...         pl.struct(["a", "b"]).map(
    ...             lambda x: test_func(x.struct.field("a"), x.struct.field("b"), 1)
    ...         )
    ...     ).alias("a+b+c")
    ... )
    shape: (4, 3)
    ┌─────┬─────┬───────┐
    │ a   ┆ b   ┆ a+b+c │
    │ --- ┆ --- ┆ ---   │
    │ i64 ┆ i64 ┆ i64   │
    ╞═════╪═════╪═══════╡
    │ 1   ┆ 4   ┆ 6     │
    │ 2   ┆ 5   ┆ 8     │
    │ 3   ┆ 6   ┆ 10    │
    │ 4   ┆ 7   ┆ 12    │
    └─────┴─────┴───────┘
    """
    exprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(
        _map_mul(
            exprs, function, return_dtype, apply_groups=False, returns_scalar=False
        )
    )


def apply(
    exprs: Sequence[str | Expr],
    function: Callable[[Sequence[Series]], Series | Any],
    return_dtype: PolarsDataType | None = None,
    *,
    returns_scalar: bool = True,
) -> Expr:
    """
    Apply a custom/user-defined function (UDF) in a GroupBy context.

    Depending on the context it has the following behavior:

    * Select
        Don't use apply, use `map`
    * GroupBy
        expected type `f`: Callable[[Series], Series]
        Applies a python function over each group.

    Parameters
    ----------
    exprs
        Input Series to f
    function
        Function to apply over the input
    return_dtype
        dtype of the output Series
    returns_scalar
        If the function returns a single scalar as output.

    Returns
    -------
    Expr

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [7, 2, 3, 4],
    ...         "b": [2, 5, 6, 7],
    ...     }
    ... )
    >>> df
    shape: (4, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 7   ┆ 2   │
    │ 2   ┆ 5   │
    │ 3   ┆ 6   │
    │ 4   ┆ 7   │
    └─────┴─────┘

    Calculate product of ``a``.

    >>> df.with_columns(pl.col("a").apply(lambda x: x * x).alias("product_a"))
    shape: (4, 3)
    ┌─────┬─────┬───────────┐
    │ a   ┆ b   ┆ product_a │
    │ --- ┆ --- ┆ ---       │
    │ i64 ┆ i64 ┆ i64       │
    ╞═════╪═════╪═══════════╡
    │ 7   ┆ 2   ┆ 49        │
    │ 2   ┆ 5   ┆ 4         │
    │ 3   ┆ 6   ┆ 9         │
    │ 4   ┆ 7   ┆ 16        │
    └─────┴─────┴───────────┘
    """
    exprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(
        _map_mul(
            exprs,
            function,
            return_dtype,
            apply_groups=True,
            returns_scalar=returns_scalar,
        )
    )


def fold(
    acc: IntoExpr,
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
) -> Expr:
    """
    Accumulate over multiple columns horizontally/ row wise with a left fold.

    Parameters
    ----------
    acc
        Accumulator Expression. This is the value that will be initialized when the fold
        starts. For a sum this could for instance be lit(0).
    function
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Notes
    -----
    If you simply want the first encountered expression as accumulator,
    consider using ``reduce``.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [3, 4, 5],
    ...         "c": [5, 6, 7],
    ...     }
    ... )
    >>> df
    shape: (3, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   │
    │ 2   ┆ 4   ┆ 6   │
    │ 3   ┆ 5   ┆ 7   │
    └─────┴─────┴─────┘

    Horizontally sum over all columns and add 1.

    >>> df.select(
    ...     pl.fold(
    ...         acc=pl.lit(1), function=lambda acc, x: acc + x, exprs=pl.col("*")
    ...     ).alias("sum"),
    ... )
    shape: (3, 1)
    ┌─────┐
    │ sum │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 10  │
    │ 13  │
    │ 16  │
    └─────┘

    You can also apply a condition/predicate on all columns:

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [0, 1, 2],
    ...     }
    ... )
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 0   │
    │ 2   ┆ 1   │
    │ 3   ┆ 2   │
    └─────┴─────┘

    >>> df.filter(
    ...     pl.fold(
    ...         acc=pl.lit(True),
    ...         function=lambda acc, x: acc & x,
    ...         exprs=pl.col("*") > 1,
    ...     )
    ... )
    shape: (1, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 3   ┆ 2   │
    └─────┴─────┘
    """
    # in case of pl.col("*")
    acc = expr_to_lit_or_expr(acc, str_to_lit=True)
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]

    exprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_fold(acc._pyexpr, function, exprs))


def reduce(
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
) -> Expr:
    """
    Accumulate over multiple columns horizontally/ row wise with a left fold.

    Parameters
    ----------
    function
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Notes
    -----
    See ``fold`` for the version with an explicit accumulator.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [0, 1, 2],
    ...     }
    ... )
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 0   │
    │ 2   ┆ 1   │
    │ 3   ┆ 2   │
    └─────┴─────┘

    Horizontally sum over all columns.

    >>> df.select(
    ...     pl.reduce(function=lambda acc, x: acc + x, exprs=pl.col("*")).alias("sum"),
    ... )
    shape: (3, 1)
    ┌─────┐
    │ sum │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 3   │
    │ 5   │
    └─────┘

    """
    # in case of pl.col("*")
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]

    exprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_reduce(function, exprs))


def cumfold(
    acc: IntoExpr,
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
    *,
    include_init: bool = False,
) -> Expr:
    """
    Cumulatively accumulate over multiple columns horizontally/ row wise with a left fold.

    Every cumulative result is added as a separate field in a Struct column.

    Parameters
    ----------
    acc
        Accumulator Expression. This is the value that will be initialized when the fold
        starts. For a sum this could for instance be lit(0).
    function
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.
    include_init
        Include the initial accumulator state as struct field.

    Notes
    -----
    If you simply want the first encountered expression as accumulator,
    consider using ``cumreduce``.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [3, 4, 5],
    ...         "c": [5, 6, 7],
    ...     }
    ... )
    >>> df
    shape: (3, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   │
    │ 2   ┆ 4   ┆ 6   │
    │ 3   ┆ 5   ┆ 7   │
    └─────┴─────┴─────┘

    >>> df.select(
    ...     pl.cumfold(
    ...         acc=pl.lit(1), function=lambda acc, x: acc + x, exprs=pl.col("*")
    ...     ).alias("cumfold"),
    ... )
    shape: (3, 1)
    ┌───────────┐
    │ cumfold   │
    │ ---       │
    │ struct[3] │
    ╞═══════════╡
    │ {2,5,10}  │
    │ {3,7,13}  │
    │ {4,9,16}  │
    └───────────┘

    """  # noqa: W505
    # in case of pl.col("*")
    acc = expr_to_lit_or_expr(acc, str_to_lit=True)
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]

    exprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_cumfold(acc._pyexpr, function, exprs, include_init))


def cumreduce(
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
) -> Expr:
    """
    Cumulatively accumulate over multiple columns horizontally/ row wise with a left fold.

    Every cumulative result is added as a separate field in a Struct column.

    Parameters
    ----------
    function
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [3, 4, 5],
    ...         "c": [5, 6, 7],
    ...     }
    ... )
    >>> df
    shape: (3, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   │
    │ 2   ┆ 4   ┆ 6   │
    │ 3   ┆ 5   ┆ 7   │
    └─────┴─────┴─────┘

    >>> df.select(
    ...     pl.cumreduce(function=lambda acc, x: acc + x, exprs=pl.col("*")).alias(
    ...         "cumreduce"
    ...     ),
    ... )
    shape: (3, 1)
    ┌───────────┐
    │ cumreduce │
    │ ---       │
    │ struct[3] │
    ╞═══════════╡
    │ {1,4,9}   │
    │ {2,6,12}  │
    │ {3,8,15}  │
    └───────────┘
    """  # noqa: W505
    # in case of pl.col("*")
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]

    exprs = selection_to_pyexpr_list(exprs)
    return wrap_expr(_cumreduce(function, exprs))


@overload
def any(exprs: Series) -> bool:  # type: ignore[misc]
    ...


@overload
def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    ...


@deprecated_alias(columns="exprs")
def any(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr | bool:
    """
    Evaluate a bitwise OR operation.

    If a single string is passed, this is an alias for ``pl.col(name).any()``.
    If a single Series is passed, this is an alias for ``Series.any()``.

    Otherwise, this function computes the bitwise OR horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [True, False, True],
    ...         "b": [False, False, False],
    ...         "c": [False, True, False],
    ...     }
    ... )
    >>> df
    shape: (3, 3)
    ┌───────┬───────┬───────┐
    │ a     ┆ b     ┆ c     │
    │ ---   ┆ ---   ┆ ---   │
    │ bool  ┆ bool  ┆ bool  │
    ╞═══════╪═══════╪═══════╡
    │ true  ┆ false ┆ false │
    │ false ┆ false ┆ true  │
    │ true  ┆ false ┆ false │
    └───────┴───────┴───────┘

    Compares the values (in binary format) and return true if any value in the column
    is true.

    >>> df.select(pl.any("*"))
    shape: (1, 3)
    ┌──────┬───────┬──────┐
    │ a    ┆ b     ┆ c    │
    │ ---  ┆ ---   ┆ ---  │
    │ bool ┆ bool  ┆ bool │
    ╞══════╪═══════╪══════╡
    │ true ┆ false ┆ true │
    └──────┴───────┴──────┘

    Across multiple columns:

    >>> df.select(pl.any("a", "b"))
    shape: (3, 1)
    ┌───────┐
    │ any   │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ true  │
    │ false │
    │ true  │
    └───────┘

    """
    if not more_exprs:
        if isinstance(exprs, pl.Series):
            return exprs.any()
        elif isinstance(exprs, str):
            return col(exprs).any()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))

    exprs_wrapped = [wrap_expr(e) for e in exprs]
    return fold(
        lit(False), lambda a, b: a.cast(bool) | b.cast(bool), exprs_wrapped
    ).alias("any")


@overload
def all(exprs: Series) -> bool:  # type: ignore[misc]
    ...


@overload
def all(
    exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr
) -> Expr:
    ...


@deprecated_alias(columns="exprs")
def all(
    exprs: IntoExpr | Iterable[IntoExpr] | None = None, *more_exprs: IntoExpr
) -> Expr | bool:
    """
    Either return an expression representing all columns, or evaluate a bitwise AND operation.

    If no arguments are passed, this is an alias for ``pl.col("*")``.
    If a single string is passed, this is an alias for ``pl.col(name).any()``.
    If a single Series is passed, this is an alias for ``Series.any()``.

    Otherwise, this function computes the bitwise AND horizontally across multiple
    columns.

    Parameters
    ----------
    exprs
        Column(s) to use in the aggregation. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to use in the aggregation, specified as positional arguments.

    Examples
    --------
    Selecting all columns and calculating the sum:

    >>> df = pl.DataFrame(
    ...     {"a": [1, 2, 3], "b": ["hello", "foo", "bar"], "c": [1, 1, 1]}
    ... )
    >>> df.select(pl.all().sum())
    shape: (1, 3)
    ┌─────┬──────┬─────┐
    │ a   ┆ b    ┆ c   │
    │ --- ┆ ---  ┆ --- │
    │ i64 ┆ str  ┆ i64 │
    ╞═════╪══════╪═════╡
    │ 6   ┆ null ┆ 3   │
    └─────┴──────┴─────┘

    Bitwise AND across multiple columns:

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [True, False, True],
    ...         "b": [True, False, False],
    ...         "c": [False, True, False],
    ...     }
    ... )
    >>> df.select(pl.all("a", "b"))
    shape: (3, 1)
    ┌───────┐
    │ all   │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ true  │
    │ false │
    │ false │
    └───────┘

    """  # noqa: W505
    if not more_exprs:
        if exprs is None:
            return col("*")
        elif isinstance(exprs, pl.Series):
            return exprs.all()
        elif isinstance(exprs, str):
            return col(exprs).all()

    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))

    exprs_wrapped = [wrap_expr(e) for e in exprs]
    return fold(
        lit(True), lambda a, b: a.cast(bool) & b.cast(bool), exprs_wrapped
    ).alias("all")


[docs]def exclude(
    columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType],
    *more_columns: str | PolarsDataType,
) -> Expr:
    """
    Represent all columns except for the given columns.

    Syntactic sugar for ``pl.all().exclude(columns)``.

    Parameters
    ----------
    columns
        The name or datatype of the column(s) to exclude. Accepts regular expression
        input. Regular expressions should start with ``^`` and end with ``$``.
    *more_columns
        Additional names or datatypes of columns to exclude, specified as positional
        arguments.

    Examples
    --------
    Exclude by column name(s):

    >>> df = pl.DataFrame(
    ...     {
    ...         "aa": [1, 2, 3],
    ...         "ba": ["a", "b", None],
    ...         "cc": [None, 2.5, 1.5],
    ...     }
    ... )
    >>> df.select(pl.exclude("ba"))
    shape: (3, 2)
    ┌─────┬──────┐
    │ aa  ┆ cc   │
    │ --- ┆ ---  │
    │ i64 ┆ f64  │
    ╞═════╪══════╡
    │ 1   ┆ null │
    │ 2   ┆ 2.5  │
    │ 3   ┆ 1.5  │
    └─────┴──────┘

    Exclude by regex, e.g. removing all columns whose names end with the letter "a":

    >>> df.select(pl.exclude("^.*a$"))
    shape: (3, 1)
    ┌──────┐
    │ cc   │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ null │
    │ 2.5  │
    │ 1.5  │
    └──────┘

    Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64:

    >>> df.select(pl.exclude([pl.Int64, pl.Float64]))
    shape: (3, 1)
    ┌──────┐
    │ ba   │
    │ ---  │
    │ str  │
    ╞══════╡
    │ a    │
    │ b    │
    │ null │
    └──────┘

    """
    return col("*").exclude(columns, *more_columns)


def groups(column: str) -> Expr:
    """Syntactic sugar for `pl.col("foo").agg_groups()`."""
    return col(column).agg_groups()


def quantile(
    column: str,
    quantile: float | Expr,
    interpolation: RollingInterpolationMethod = "nearest",
) -> Expr:
    """
    Syntactic sugar for `pl.col("foo").quantile(..)`.

    Parameters
    ----------
    column
        Column name.
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
        Interpolation method.

    """
    return col(column).quantile(quantile, interpolation)


@overload
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = ...,
    *,
    eager: Literal[False],
) -> Expr:
    ...


@overload
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = ...,
    *,
    eager: Literal[True],
    dtype: PolarsDataType | None = ...,
) -> Series:
    ...


@overload
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = ...,
    *,
    eager: bool = ...,
    dtype: PolarsDataType | None = ...,
) -> Expr | Series:
    ...


@deprecated_alias(low="start", high="end")
def arange(
    start: int | Expr | Series,
    end: int | Expr | Series,
    step: int = 1,
    *,
    eager: bool = False,
    dtype: PolarsDataType | None = None,
) -> Expr | Series:
    """
    Create a range expression (or Series).

    This can be used in a `select`, `with_column` etc. Be sure that the resulting
    range size is equal to the length of the DataFrame you are collecting.

    Examples
    --------
    >>> df.lazy().filter(pl.col("foo") < pl.arange(0, 100)).collect()  # doctest: +SKIP

    Parameters
    ----------
    start
        Lower bound of range.
    end
        Upper bound of range.
    step
        Step size of the range.
    eager
        Evaluate immediately and return a ``Series``. If set to ``False`` (default),
        return an expression instead.
    dtype
        Apply an explicit integer dtype to the resulting expression (default is Int64).

    """
    start = expr_to_lit_or_expr(start, str_to_lit=False)
    end = expr_to_lit_or_expr(end, str_to_lit=False)
    range_expr = wrap_expr(_arange(start._pyexpr, end._pyexpr, step))

    if dtype is not None and dtype != Int64:
        range_expr = range_expr.cast(dtype)
    if not eager:
        return range_expr
    else:
        return (
            pl.DataFrame()
            .select(range_expr)
            .to_series()
            .rename("arange", in_place=True)
        )


def arg_sort_by(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    descending: bool | Sequence[bool] = False,
) -> Expr:
    """
    Return the row indices that would sort the columns.

    Parameters
    ----------
    exprs
        Column(s) to arg sort by. Accepts expression input. Strings are parsed as column
        names.
    *more_exprs
        Additional columns to arg sort by, specified as positional arguments.
    descending
        Sort in descending order. When sorting by multiple columns, can be specified
        per column by passing a sequence of booleans.

    Examples
    --------
    Pass a single column name to compute the arg sort by that column.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [0, 1, 1, 0],
    ...         "b": [3, 2, 3, 2],
    ...     }
    ... )
    >>> df.select(pl.arg_sort_by("a"))
    shape: (4, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 0   │
    │ 3   │
    │ 1   │
    │ 2   │
    └─────┘

    Compute the arg sort by multiple columns by either passing a list of columns, or by
    specifying each column as a positional argument.

    >>> df.select(pl.arg_sort_by(["a", "b"], descending=True))
    shape: (4, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    │ 1   │
    │ 0   │
    │ 3   │
    └─────┘

    """
    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))

    if isinstance(descending, bool):
        descending = [descending] * len(exprs)
    elif len(exprs) != len(descending):
        raise ValueError(
            f"the length of `descending` ({len(descending)}) does not match the length of `exprs` ({len(exprs)})"
        )
    return wrap_expr(_arg_sort_by(exprs, descending))


def duration(
    *,
    days: Expr | str | int | None = None,
    seconds: Expr | str | int | None = None,
    nanoseconds: Expr | str | int | None = None,
    microseconds: Expr | str | int | None = None,
    milliseconds: Expr | str | int | None = None,
    minutes: Expr | str | int | None = None,
    hours: Expr | str | int | None = None,
    weeks: Expr | str | int | None = None,
) -> Expr:
    """
    Create polars `Duration` from distinct time components.

    Returns
    -------
    Expr of type `pl.Duration`

    Examples
    --------
    >>> from datetime import datetime
    >>> df = pl.DataFrame(
    ...     {
    ...         "dt": [datetime(2022, 1, 1), datetime(2022, 1, 2)],
    ...         "add": [1, 2],
    ...     }
    ... )
    >>> print(df)
    shape: (2, 2)
    ┌─────────────────────┬─────┐
    │ dt                  ┆ add │
    │ ---                 ┆ --- │
    │ datetime[μs]        ┆ i64 │
    ╞═════════════════════╪═════╡
    │ 2022-01-01 00:00:00 ┆ 1   │
    │ 2022-01-02 00:00:00 ┆ 2   │
    └─────────────────────┴─────┘
    >>> with pl.Config(tbl_width_chars=120):
    ...     df.select(
    ...         (pl.col("dt") + pl.duration(weeks="add")).alias("add_weeks"),
    ...         (pl.col("dt") + pl.duration(days="add")).alias("add_days"),
    ...         (pl.col("dt") + pl.duration(seconds="add")).alias("add_seconds"),
    ...         (pl.col("dt") + pl.duration(milliseconds="add")).alias("add_millis"),
    ...         (pl.col("dt") + pl.duration(hours="add")).alias("add_hours"),
    ...     )
    ...
    shape: (2, 5)
    ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
    │ add_weeks           ┆ add_days            ┆ add_seconds         ┆ add_millis              ┆ add_hours           │
    │ ---                 ┆ ---                 ┆ ---                 ┆ ---                     ┆ ---                 │
    │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]            ┆ datetime[μs]        │
    ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
    │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
    │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
    └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘

    """  # noqa: W505
    if hours is not None:
        hours = expr_to_lit_or_expr(hours, str_to_lit=False)._pyexpr
    if minutes is not None:
        minutes = expr_to_lit_or_expr(minutes, str_to_lit=False)._pyexpr
    if seconds is not None:
        seconds = expr_to_lit_or_expr(seconds, str_to_lit=False)._pyexpr
    if milliseconds is not None:
        milliseconds = expr_to_lit_or_expr(milliseconds, str_to_lit=False)._pyexpr
    if microseconds is not None:
        microseconds = expr_to_lit_or_expr(microseconds, str_to_lit=False)._pyexpr
    if nanoseconds is not None:
        nanoseconds = expr_to_lit_or_expr(nanoseconds, str_to_lit=False)._pyexpr
    if days is not None:
        days = expr_to_lit_or_expr(days, str_to_lit=False)._pyexpr
    if weeks is not None:
        weeks = expr_to_lit_or_expr(weeks, str_to_lit=False)._pyexpr

    return wrap_expr(
        _duration(
            days,
            seconds,
            nanoseconds,
            microseconds,
            milliseconds,
            minutes,
            hours,
            weeks,
        )
    )


def datetime_(
    year: Expr | str | int,
    month: Expr | str | int,
    day: Expr | str | int,
    hour: Expr | str | int | None = None,
    minute: Expr | str | int | None = None,
    second: Expr | str | int | None = None,
    microsecond: Expr | str | int | None = None,
) -> Expr:
    """
    Create a Polars literal expression of type Datetime.

    Parameters
    ----------
    year
        column or literal.
    month
        column or literal, ranging from 1-12.
    day
        column or literal, ranging from 1-31.
    hour
        column or literal, ranging from 1-23.
    minute
        column or literal, ranging from 1-59.
    second
        column or literal, ranging from 1-59.
    microsecond
        column or literal, ranging from 1-999999.

    Returns
    -------
    Expr of type `pl.Datetime`

    """
    year_expr = expr_to_lit_or_expr(year, str_to_lit=False)
    month_expr = expr_to_lit_or_expr(month, str_to_lit=False)
    day_expr = expr_to_lit_or_expr(day, str_to_lit=False)

    if hour is not None:
        hour = expr_to_lit_or_expr(hour, str_to_lit=False)._pyexpr
    if minute is not None:
        minute = expr_to_lit_or_expr(minute, str_to_lit=False)._pyexpr
    if second is not None:
        second = expr_to_lit_or_expr(second, str_to_lit=False)._pyexpr
    if microsecond is not None:
        microsecond = expr_to_lit_or_expr(microsecond, str_to_lit=False)._pyexpr

    return wrap_expr(
        _datetime(
            year_expr._pyexpr,
            month_expr._pyexpr,
            day_expr._pyexpr,
            hour,
            minute,
            second,
            microsecond,
        )
    )


def date_(
    year: Expr | str | int,
    month: Expr | str | int,
    day: Expr | str | int,
) -> Expr:
    """
    Create a Polars literal expression of type Date.

    Parameters
    ----------
    year
        column or literal.
    month
        column or literal, ranging from 1-12.
    day
        column or literal, ranging from 1-31.

    Returns
    -------
    Expr of type pl.Date

    """
    return datetime_(year, month, day).cast(Date).alias("date")


def concat_str(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    separator: str = "",
) -> Expr:
    """
    Horizontally concatenate columns into a single string column.

    Operates in linear time.

    Parameters
    ----------
    exprs
        Columns to concatenate into a single string column. Accepts expression input.
        Strings are parsed as column names, other non-expression inputs are parsed as
        literals. Non-``Utf8`` columns are cast to ``Utf8``.
    *more_exprs
        Additional columns to concatenate into a single string column, specified as
        positional arguments.
    separator
        String that will be used to separate the values of each column.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": ["dogs", "cats", None],
    ...         "c": ["play", "swim", "walk"],
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.concat_str(
    ...         [
    ...             pl.col("a") * 2,
    ...             pl.col("b"),
    ...             pl.col("c"),
    ...         ],
    ...         separator=" ",
    ...     ).alias("full_sentence"),
    ... )
    shape: (3, 4)
    ┌─────┬──────┬──────┬───────────────┐
    │ a   ┆ b    ┆ c    ┆ full_sentence │
    │ --- ┆ ---  ┆ ---  ┆ ---           │
    │ i64 ┆ str  ┆ str  ┆ str           │
    ╞═════╪══════╪══════╪═══════════════╡
    │ 1   ┆ dogs ┆ play ┆ 2 dogs play   │
    │ 2   ┆ cats ┆ swim ┆ 4 cats swim   │
    │ 3   ┆ null ┆ walk ┆ null          │
    └─────┴──────┴──────┴───────────────┘

    """
    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_concat_str(exprs, separator))


def format(f_string: str, *args: Expr | str) -> Expr:
    """
    Format expressions as a string.

    Parameters
    ----------
    f_string
        A string that with placeholders.
        For example: "hello_{}" or "{}_world
    args
        Expression(s) that fill the placeholders

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": ["a", "b", "c"],
    ...         "b": [1, 2, 3],
    ...     }
    ... )
    >>> df.select(
    ...     [
    ...         pl.format("foo_{}_bar_{}", pl.col("a"), "b").alias("fmt"),
    ...     ]
    ... )
    shape: (3, 1)
    ┌─────────────┐
    │ fmt         │
    │ ---         │
    │ str         │
    ╞═════════════╡
    │ foo_a_bar_1 │
    │ foo_b_bar_2 │
    │ foo_c_bar_3 │
    └─────────────┘

    """
    if f_string.count("{}") != len(args):
        raise ValueError("number of placeholders should equal the number of arguments")

    exprs = []

    arguments = iter(args)
    for i, s in enumerate(f_string.split("{}")):
        if i > 0:
            e = expr_to_lit_or_expr(next(arguments), str_to_lit=False)
            exprs.append(e)

        if len(s) > 0:
            exprs.append(lit(s))

    return concat_str(exprs, separator="")


def concat_list(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    """
    Horizontally concatenate columns into a single list column.

    Operates in linear time.

    Parameters
    ----------
    exprs
        Columns to concatenate into a single list column. Accepts expression input.
        Strings are parsed as column names, other non-expression inputs are parsed as
        literals.
    *more_exprs
        Additional columns to concatenate into a single list column, specified as
        positional arguments.

    Examples
    --------
    Create lagged columns and collect them into a list. This mimics a rolling window.

    >>> df = pl.DataFrame({"A": [1.0, 2.0, 9.0, 2.0, 13.0]})
    >>> df = df.select([pl.col("A").shift(i).alias(f"A_lag_{i}") for i in range(3)])
    >>> df.select(
    ...     pl.concat_list([f"A_lag_{i}" for i in range(3)][::-1]).alias("A_rolling")
    ... )
    shape: (5, 1)
    ┌───────────────────┐
    │ A_rolling         │
    │ ---               │
    │ list[f64]         │
    ╞═══════════════════╡
    │ [null, null, 1.0] │
    │ [null, 1.0, 2.0]  │
    │ [1.0, 2.0, 9.0]   │
    │ [2.0, 9.0, 2.0]   │
    │ [9.0, 2.0, 13.0]  │
    └───────────────────┘

    """
    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_concat_list(exprs))


def collect_all(
    lazy_frames: Sequence[LazyFrame],
    *,
    type_coercion: bool = True,
    predicate_pushdown: bool = True,
    projection_pushdown: bool = True,
    simplify_expression: bool = True,
    no_optimization: bool = False,
    slice_pushdown: bool = True,
    common_subplan_elimination: bool = True,
    streaming: bool = False,
) -> list[DataFrame]:
    """
    Collect multiple LazyFrames at the same time.

    This runs all the computation graphs in parallel on Polars threadpool.

    Parameters
    ----------
    lazy_frames
        A list of LazyFrames to collect.
    type_coercion
        Do type coercion optimization.
    predicate_pushdown
        Do predicate pushdown optimization.
    projection_pushdown
        Do projection pushdown optimization.
    simplify_expression
        Run simplify expressions optimization.
    no_optimization
        Turn off optimizations.
    slice_pushdown
        Slice pushdown optimization.
    common_subplan_elimination
        Will try to cache branching subplans that occur on self-joins or unions.
    streaming
        Run parts of the query in a streaming fashion (this is in an alpha state)

    Returns
    -------
    List[DataFrame]

    """
    if no_optimization:
        predicate_pushdown = False
        projection_pushdown = False
        slice_pushdown = False
        common_subplan_elimination = False

    prepared = []

    for lf in lazy_frames:
        ldf = lf._ldf.optimization_toggle(
            type_coercion,
            predicate_pushdown,
            projection_pushdown,
            simplify_expression,
            slice_pushdown,
            common_subplan_elimination,
            streaming,
        )
        prepared.append(ldf)

    out = _collect_all(prepared)

    # wrap the pydataframes into dataframe
    result = [wrap_df(pydf) for pydf in out]

    return result


def select(
    exprs: IntoExpr | Iterable[IntoExpr] | None = None,
    *more_exprs: IntoExpr,
    **named_exprs: IntoExpr,
) -> DataFrame:
    """
    Run polars expressions without a context.

    This is syntactic sugar for running ``df.select`` on an empty DataFrame.

    Parameters
    ----------
    exprs
        Expression or expressions to run.
    *more_exprs
        Additional expressions to run, specified as positional arguments.
    **named_exprs
        Additional expressions to run, specified as keyword arguments. The expressions
        will be renamed to the keyword used.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> foo = pl.Series("foo", [1, 2, 3])
    >>> bar = pl.Series("bar", [3, 2, 1])
    >>> pl.select(pl.min([foo, bar]))
    shape: (3, 1)
    ┌─────┐
    │ min │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    │ 1   │
    └─────┘

    """
    return pl.DataFrame().select(exprs, *more_exprs, **named_exprs)


@overload
def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = ...,
    *more_exprs: IntoExpr,
    eager: Literal[False] = ...,
    schema: SchemaDict | None = ...,
    **named_exprs: IntoExpr,
) -> Expr:
    ...


@overload
def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = ...,
    *more_exprs: IntoExpr,
    eager: Literal[True],
    schema: SchemaDict | None = ...,
    **named_exprs: IntoExpr,
) -> Series:
    ...


@overload
def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = ...,
    *more_exprs: IntoExpr,
    eager: bool,
    schema: SchemaDict | None = ...,
    **named_exprs: IntoExpr,
) -> Expr | Series:
    ...


def struct(
    exprs: IntoExpr | Iterable[IntoExpr] = None,
    *more_exprs: IntoExpr,
    eager: bool = False,
    schema: SchemaDict | None = None,
    **named_exprs: IntoExpr,
) -> Expr | Series:
    """
    Collect columns into a struct column.

    Parameters
    ----------
    exprs
        Column(s) to collect into a struct column. Accepts expression input. Strings are
        parsed as column names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to collect into the struct column, specified as positional
        arguments.
    eager
        Evaluate immediately and return a ``Series``. If set to ``False`` (default),
        return an expression instead.
    schema
        Optional schema that explicitly defines the struct field dtypes.
    **named_exprs
        Additional columns to collect into the struct column, specified as keyword
        arguments. The columns will be renamed to the keyword used.

    Examples
    --------
    Collect all columns of a dataframe into a struct by passing ``pl.all()``.

    >>> df = pl.DataFrame(
    ...     {
    ...         "int": [1, 2],
    ...         "str": ["a", "b"],
    ...         "bool": [True, None],
    ...         "list": [[1, 2], [3]],
    ...     }
    ... )
    >>> df.select(pl.struct(pl.all()).alias("my_struct"))
    shape: (2, 1)
    ┌─────────────────────┐
    │ my_struct           │
    │ ---                 │
    │ struct[4]           │
    ╞═════════════════════╡
    │ {1,"a",true,[1, 2]} │
    │ {2,"b",null,[3]}    │
    └─────────────────────┘

    Collect selected columns into a struct by either passing a list of columns, or by
    specifying each column as a positional argument.

    >>> df.select(pl.struct("int", False).alias("my_struct"))
    shape: (2, 1)
    ┌───────────┐
    │ my_struct │
    │ ---       │
    │ struct[2] │
    ╞═══════════╡
    │ {1,false} │
    │ {2,false} │
    └───────────┘

    Use keyword arguments to easily name each struct field.

    >>> df.select(pl.struct(p="int", q="bool").alias("my_struct")).schema
    {'my_struct': Struct([Field('p', Int64), Field('q', Boolean)])}

    """
    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    if named_exprs:
        exprs.extend(
            expr_to_lit_or_expr(expr, name=name, str_to_lit=False)._pyexpr
            for name, expr in named_exprs.items()
        )

    expr = wrap_expr(_as_struct(exprs))
    if schema:
        expr = expr.cast(Struct(schema), strict=False)

    if eager:
        return select(expr).to_series()
    else:
        return expr


@overload
def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: Literal[False] = ...,
    name: str | None = ...,
) -> Expr:
    ...


@overload
def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: Literal[True],
    name: str | None = ...,
) -> Series:
    ...


@overload
def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: bool,
    name: str | None,
) -> Expr | Series:
    ...


def repeat(
    value: float | int | str | bool | None,
    n: Expr | int,
    *,
    eager: bool = False,
    name: str | None = None,
) -> Expr | Series:
    """
    Repeat a single value n times.

    Parameters
    ----------
    value
        Value to repeat.
    n
        repeat `n` times
    eager
        Evaluate immediately and return a ``Series``. If set to ``False`` (default),
        return an expression instead.
    name
        Only used in `eager` mode. As expression, use `alias`

    """
    if eager:
        if name is None:
            name = ""
        dtype = py_type_to_dtype(type(value))
        if (
            dtype == Int64
            and isinstance(value, int)
            and -(2**31) <= value <= 2**31 - 1
        ):
            dtype = Int32
        s = pl.Series._repeat(name, value, n, dtype)  # type: ignore[arg-type]
        return s
    else:
        if isinstance(n, int):
            n = lit(n)
        return wrap_expr(_repeat(value, n._pyexpr))


@overload
def arg_where(condition: Expr | Series, *, eager: Literal[False] = ...) -> Expr:
    ...


@overload
def arg_where(condition: Expr | Series, *, eager: Literal[True]) -> Series:
    ...


@overload
def arg_where(condition: Expr | Series, *, eager: bool) -> Expr | Series:
    ...


def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series:
    """
    Return indices where `condition` evaluates `True`.

    Parameters
    ----------
    condition
        Boolean expression to evaluate
    eager
        Evaluate immediately and return a ``Series``. If set to ``False`` (default),
        return an expression instead.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    >>> df.select(
    ...     [
    ...         pl.arg_where(pl.col("a") % 2 == 0),
    ...     ]
    ... ).to_series()
    shape: (2,)
    Series: 'a' [u32]
    [
        1
        3
    ]

    See Also
    --------
    Series.arg_true : Return indices where Series is True

    """
    if eager:
        if not isinstance(condition, pl.Series):
            raise ValueError(
                "expected 'Series' in 'arg_where' if 'eager=True', got"
                f" {type(condition)}"
            )
        return condition.to_frame().select(arg_where(col(condition.name))).to_series()
    else:
        condition = expr_to_lit_or_expr(condition, str_to_lit=True)
        return wrap_expr(_arg_where(condition._pyexpr))


def coalesce(exprs: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Expr:
    """
    Folds the columns from left to right, keeping the first non-null value.

    Parameters
    ----------
    exprs
        Columns to coalesce. Accepts expression input. Strings are parsed as column
        names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to coalesce, specified as positional arguments.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, None, None, None],
    ...         "b": [1, 2, None, None],
    ...         "c": [5, None, 3, None],
    ...     }
    ... )
    >>> df.with_columns(pl.coalesce(["a", "b", "c", 10]).alias("d"))
    shape: (4, 4)
    ┌──────┬──────┬──────┬─────┐
    │ a    ┆ b    ┆ c    ┆ d   │
    │ ---  ┆ ---  ┆ ---  ┆ --- │
    │ i64  ┆ i64  ┆ i64  ┆ i64 │
    ╞══════╪══════╪══════╪═════╡
    │ 1    ┆ 1    ┆ 5    ┆ 1   │
    │ null ┆ 2    ┆ null ┆ 2   │
    │ null ┆ null ┆ 3    ┆ 3   │
    │ null ┆ null ┆ null ┆ 10  │
    └──────┴──────┴──────┴─────┘
    >>> df.with_columns(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    shape: (4, 4)
    ┌──────┬──────┬──────┬──────┐
    │ a    ┆ b    ┆ c    ┆ d    │
    │ ---  ┆ ---  ┆ ---  ┆ ---  │
    │ i64  ┆ i64  ┆ i64  ┆ f64  │
    ╞══════╪══════╪══════╪══════╡
    │ 1    ┆ 1    ┆ 5    ┆ 1.0  │
    │ null ┆ 2    ┆ null ┆ 2.0  │
    │ null ┆ null ┆ 3    ┆ 3.0  │
    │ null ┆ null ┆ null ┆ 10.0 │
    └──────┴──────┴──────┴──────┘

    """
    exprs = selection_to_pyexpr_list(exprs)
    if more_exprs:
        exprs.extend(selection_to_pyexpr_list(more_exprs))
    return wrap_expr(_coalesce(exprs))


@overload
def from_epoch(column: str | Expr, time_unit: EpochTimeUnit = ...) -> Expr:
    ...


@overload
def from_epoch(
    column: Series | Sequence[int], time_unit: EpochTimeUnit = ...
) -> Series:
    ...


def from_epoch(
    column: str | Expr | Series | Sequence[int], time_unit: EpochTimeUnit = "s"
) -> Expr | Series:
    """
    Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).

    Depending on the `time_unit` provided, this function will return a different dtype:
    - time_unit="d" returns pl.Date
    - time_unit="s" returns pl.Datetime["us"] (pl.Datetime's default)
    - time_unit="ms" returns pl.Datetime["ms"]
    - time_unit="us" returns pl.Datetime["us"]
    - time_unit="ns" returns pl.Datetime["ns"]

    Parameters
    ----------
    column
        Series or expression to parse integers to pl.Datetime.
    time_unit
        The unit of time of the timesteps since epoch time.

    Examples
    --------
    >>> df = pl.DataFrame({"timestamp": [1666683077, 1666683099]}).lazy()
    >>> df.select(pl.from_epoch(pl.col("timestamp"), time_unit="s")).collect()
    shape: (2, 1)
    ┌─────────────────────┐
    │ timestamp           │
    │ ---                 │
    │ datetime[μs]        │
    ╞═════════════════════╡
    │ 2022-10-25 07:31:17 │
    │ 2022-10-25 07:31:39 │
    └─────────────────────┘

    The function can also be used in an eager context by passing a Series.

    >>> s = pl.Series([12345, 12346])
    >>> pl.from_epoch(s, time_unit="d")
    shape: (2,)
    Series: '' [date]
    [
            2003-10-20
            2003-10-21
    ]

    """
    if isinstance(column, str):
        column = col(column)
    elif not isinstance(column, (pl.Series, pl.Expr)):
        column = pl.Series(column)  # Sequence input handled by Series constructor

    if time_unit == "d":
        return column.cast(Date)
    elif time_unit == "s":
        return (column.cast(Int64) * 1_000_000).cast(Datetime("us"))
    elif time_unit in DTYPE_TEMPORAL_UNITS:
        return column.cast(Datetime(time_unit))
    else:
        raise ValueError(
            f"'time_unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got {time_unit!r}."
        )