# Source code for polars.expr.expr

from __future__ import annotations

import contextlib
import math
import operator
import warnings
from collections.abc import Collection, Mapping, Sequence
from datetime import timedelta
from functools import reduce
from io import BytesIO, StringIO
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    NoReturn,
    TypeVar,
)

import polars._reexport as pl
from polars import functions as F
from polars._utils.convert import negate_duration_string, parse_as_duration_string
from polars._utils.deprecation import (
    deprecate_function,
    deprecate_renamed_parameter,
    issue_deprecation_warning,
)
from polars._utils.parse import (
    parse_into_expression,
    parse_into_list_of_expressions,
    parse_predicates_constraints_into_expression,
)
from polars._utils.unstable import issue_unstable_warning, unstable
from polars._utils.various import (
    BUILDING_SPHINX_DOCS,
    extend_bool,
    find_stacklevel,
    no_default,
    normalize_filepath,
    sphinx_accessor,
    warn_null_comparison,
)
from polars.datatypes import Int64, is_polars_dtype, parse_into_dtype
from polars.dependencies import _check_for_numpy
from polars.dependencies import numpy as np
from polars.exceptions import CustomUFuncWarning, PolarsInefficientMapWarning
from polars.expr.array import ExprArrayNameSpace
from polars.expr.binary import ExprBinaryNameSpace
from polars.expr.categorical import ExprCatNameSpace
from polars.expr.datetime import ExprDateTimeNameSpace
from polars.expr.list import ExprListNameSpace
from polars.expr.meta import ExprMetaNameSpace
from polars.expr.name import ExprNameNameSpace
from polars.expr.string import ExprStringNameSpace
from polars.expr.struct import ExprStructNameSpace
from polars.meta import thread_pool_size

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars.polars import arg_where as py_arg_where

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars.polars import PyExpr

# Everything in the first branch is needed only by static type checkers; the
# module uses `from __future__ import annotations`, so these names are never
# evaluated at runtime.
if TYPE_CHECKING:
    import sys
    from collections.abc import Iterable
    from io import IOBase

    from polars import DataFrame, LazyFrame, Series
    from polars._typing import (
        ClosedInterval,
        FillNullStrategy,
        InterpolationMethod,
        IntoExpr,
        IntoExprColumn,
        MapElementsStrategy,
        NullBehavior,
        NumericLiteral,
        PolarsDataType,
        RankMethod,
        RollingInterpolationMethod,
        SearchSortedSide,
        SerializationFormat,
        TemporalLiteral,
        WindowMappingStrategy,
    )
    from polars._utils.various import (
        NoDefault,
    )

    # Take `Concatenate` / `ParamSpec` from the standard library on
    # Python 3.11+, otherwise from the `typing_extensions` backport.
    if sys.version_info >= (3, 11):
        from typing import Concatenate, ParamSpec
    else:
        from typing_extensions import Concatenate, ParamSpec

    # Generic helpers used in annotations (e.g. the signature of `Expr.pipe`).
    T = TypeVar("T")
    P = ParamSpec("P")

elif BUILDING_SPHINX_DOCS:
    # While the docs are being built, rebind the name `property` to
    # `sphinx_accessor` for the remainder of this module.
    # NOTE(review): presumably so namespace accessors render properly in the
    # generated documentation — confirm against `sphinx_accessor`.
    property = sphinx_accessor


class Expr:
    """Expressions that can be used in various contexts."""

    # Underlying native expression object; every method delegates to it.
    _pyexpr: PyExpr = None
    # Names of the namespace accessors available on an expression.
    _accessors: ClassVar[set[str]] = {
        "arr",
        "cat",
        "dt",
        "list",
        "meta",
        "name",
        "str",
        "bin",
        "struct",
    }

    @classmethod
    def _from_pyexpr(cls, pyexpr: PyExpr) -> Expr:
        """Wrap an existing `PyExpr` in a new instance, bypassing `__init__`."""
        expr = cls.__new__(cls)
        expr._pyexpr = pyexpr
        return expr

    def _repr_html_(self) -> str:
        """Return the string form of the underlying expression."""
        return self._pyexpr.to_str()

    def __repr__(self) -> str:
        """Return `<ClassName ['expr…'] at 0xID>`, truncating the text to 30 chars."""
        if len(expr_str := self._pyexpr.to_str()) > 30:
            expr_str = f"{expr_str[:30]}…"
        return f"<{self.__class__.__name__} [{expr_str!r}] at 0x{id(self):X}>"

    def __str__(self) -> str:
        """Return the full (untruncated) string form of the expression."""
        return self._pyexpr.to_str()

    def __bool__(self) -> NoReturn:
        """
        Always raise: an expression has no truth value.

        Raises
        ------
        TypeError
            Unconditionally, with hints on the native expression API to use
            instead of `and`, `in` and `max`.
        """
        msg = (
            "the truth value of an Expr is ambiguous"
            "\n\n"
            "You probably got here by using a Python standard library function instead "
            "of the native expressions API.\n"
            "Here are some things you might want to try:\n"
            "- instead of `pl.col('a') and pl.col('b')`, use `pl.col('a') & pl.col('b')`\n"
            "- instead of `pl.col('a') in [y, z]`, use `pl.col('a').is_in([y, z])`\n"
            "- instead of `max(pl.col('a'), pl.col('b'))`, use `pl.max_horizontal(pl.col('a'), pl.col('b'))`\n"
        )
        raise TypeError(msg)

    def __abs__(self) -> Expr:
        """Support `abs(expr)` by delegating to `Expr.abs`."""
        return self.abs()

    # operators
    #
    # Each operator parses `other` into a native expression, applies the
    # matching operation on the underlying `PyExpr`, and wraps the result.
    # NOTE(review): `str_as_lit=True` presumably makes plain strings literals
    # rather than column names — confirm against `parse_into_expression`.
    def __add__(self, other: IntoExpr) -> Expr:
        """Return `self + other` as a new expression."""
        other = parse_into_expression(other, str_as_lit=True)
        return self._from_pyexpr(self._pyexpr + other)

    def __radd__(self, other: IntoExpr) -> Expr:
        """Return `other + self` as a new expression."""
        other = parse_into_expression(other, str_as_lit=True)
        return self._from_pyexpr(other + self._pyexpr)

    def __and__(self, other: IntoExprColumn | int | bool) -> Expr:
        """Return `self & other` as a new expression."""
        other = parse_into_expression(other)
        return self._from_pyexpr(self._pyexpr.and_(other))

    def __rand__(self, other: IntoExprColumn | int | bool) -> Expr:
        """Return `other & self` as a new expression."""
        other_expr = parse_into_expression(other)
        return self._from_pyexpr(other_expr.and_(self._pyexpr))

    def __eq__(self, other: IntoExpr) -> Expr:  # type: ignore[override]
        """Return the expression `self == other` (not a Python bool)."""
        # Called on the raw operand before it is parsed into an expression.
        warn_null_comparison(other)
        other = parse_into_expression(other, str_as_lit=True)
        return self._from_pyexpr(self._pyexpr.eq(other))
def __floordiv__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr // other) def __rfloordiv__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(other // self._pyexpr) def __ge__(self, other: IntoExpr) -> Expr: warn_null_comparison(other) other = parse_into_expression(other, str_as_lit=True) return self._from_pyexpr(self._pyexpr.gt_eq(other)) def __gt__(self, other: IntoExpr) -> Expr: warn_null_comparison(other) other = parse_into_expression(other, str_as_lit=True) return self._from_pyexpr(self._pyexpr.gt(other)) def __invert__(self) -> Expr: return self.not_() def __le__(self, other: IntoExpr) -> Expr: warn_null_comparison(other) other = parse_into_expression(other, str_as_lit=True) return self._from_pyexpr(self._pyexpr.lt_eq(other)) def __lt__(self, other: IntoExpr) -> Expr: warn_null_comparison(other) other = parse_into_expression(other, str_as_lit=True) return self._from_pyexpr(self._pyexpr.lt(other)) def __mod__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr % other) def __rmod__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(other % self._pyexpr) def __mul__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr * other) def __rmul__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(other * self._pyexpr) def __ne__(self, other: IntoExpr) -> Expr: # type: ignore[override] warn_null_comparison(other) other = parse_into_expression(other, str_as_lit=True) return self._from_pyexpr(self._pyexpr.neq(other)) def __neg__(self) -> Expr: return self._from_pyexpr(-self._pyexpr) def __or__(self, other: IntoExprColumn | int | bool) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr.or_(other)) def __ror__(self, other: IntoExprColumn | int | 
bool) -> Expr: other_expr = parse_into_expression(other) return self._from_pyexpr(other_expr.or_(self._pyexpr)) def __pos__(self) -> Expr: return self def __pow__(self, exponent: IntoExprColumn | int | float) -> Expr: exponent = parse_into_expression(exponent) return self._from_pyexpr(self._pyexpr.pow(exponent)) def __rpow__(self, base: IntoExprColumn | int | float) -> Expr: base = parse_into_expression(base) return self._from_pyexpr(base) ** self def __sub__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr - other) def __rsub__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(other - self._pyexpr) def __truediv__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr / other) def __rtruediv__(self, other: IntoExpr) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(other / self._pyexpr) def __xor__(self, other: IntoExprColumn | int | bool) -> Expr: other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr.xor_(other)) def __rxor__(self, other: IntoExprColumn | int | bool) -> Expr: other_expr = parse_into_expression(other) return self._from_pyexpr(other_expr.xor_(self._pyexpr)) def __getstate__(self) -> bytes: return self._pyexpr.__getstate__() def __setstate__(self, state: bytes) -> None: self._pyexpr = F.lit(0)._pyexpr # Initialize with a dummy self._pyexpr.__setstate__(state) def __array_ufunc__( self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any ) -> Expr: """Numpy universal functions.""" if method != "__call__": msg = f"Only call is implemented not {method}" raise NotImplementedError(msg) # Numpy/Scipy ufuncs have signature None but numba signatures always exists. 
is_custom_ufunc = getattr(ufunc, "signature") is not None # noqa: B009 num_expr = sum(isinstance(inp, Expr) for inp in inputs) exprs = [ (inp, True, i) if isinstance(inp, Expr) else (inp, False, i) for i, inp in enumerate(inputs) ] if num_expr == 1: root_expr = next(expr[0] for expr in exprs if expr[1]) else: # We rename all but the first expression in case someone did e.g. # np.divide(pl.col("a"), pl.col("a")); we'll be creating a struct # below, and structs can't have duplicate names. first_renameable_expr = True actual_exprs = [] for inp, is_actual_expr, index in exprs: if is_actual_expr: if first_renameable_expr: first_renameable_expr = False else: inp = inp.alias(f"argument_{index}") actual_exprs.append(inp) root_expr = F.struct(actual_exprs) def function(s: Series) -> Series: # pragma: no cover args = [] for i, expr in enumerate(exprs): if expr[1] and num_expr > 1: args.append(s.struct[i]) elif expr[1]: args.append(s) else: args.append(expr[0]) return ufunc(*args, **kwargs) if is_custom_ufunc is True: msg = ( "Native numpy ufuncs are dispatched using `map_batches(ufunc, is_elementwise=True)` which " "is safe for native Numpy and Scipy ufuncs but custom ufuncs in a group_by " "context won't be properly grouped. Custom ufuncs are dispatched with is_elementwise=False. " f"If {ufunc.__name__} needs elementwise then please use map_batches directly." ) warnings.warn( msg, CustomUFuncWarning, stacklevel=find_stacklevel(), ) return root_expr.map_batches(function, is_elementwise=False) return root_expr.map_batches(function, is_elementwise=True) @classmethod def deserialize( cls, source: str | Path | IOBase | bytes, *, format: SerializationFormat = "binary", ) -> Expr: """ Read a serialized expression from a file. Parameters ---------- source Path to a file or a file-like object (by file-like object, we refer to objects that have a `read()` method, such as a file handler (e.g. via builtin `open` function) or `BytesIO`). 
format The format with which the Expr was serialized. Options: - `"binary"`: Deserialize from binary format (bytes). This is the default. - `"json"`: Deserialize from JSON format (string). Warnings -------- This function uses :mod:`pickle` if the logical plan contains Python UDFs, and as such inherits the security implications. Deserializing can execute arbitrary code, so it should only be attempted on trusted data. See Also -------- Expr.meta.serialize Notes ----- Serialization is not stable across Polars versions: an expression serialized in one Polars version may not be deserializable in another Polars version. Examples -------- >>> import io >>> expr = pl.col("foo").sum().over("bar") >>> bytes = expr.meta.serialize() >>> pl.Expr.deserialize(io.BytesIO(bytes)) # doctest: +ELLIPSIS <Expr ['col("foo").sum().over([col("ba…'] at ...> """ if isinstance(source, StringIO): source = BytesIO(source.getvalue().encode()) elif isinstance(source, (str, Path)): source = normalize_filepath(source) elif isinstance(source, bytes): source = BytesIO(source) if format == "binary": deserializer = PyExpr.deserialize_binary elif format == "json": deserializer = PyExpr.deserialize_json else: msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}" raise ValueError(msg) return cls._from_pyexpr(deserializer(source)) def to_physical(self) -> Expr: """ Cast to physical representation of the logical dtype. - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - `List(inner)` -> `List(physical of inner)` - `Array(inner)` -> `Array(physical of inner)` - `Struct(fields)` -> `Struct(physical of fields)` Other data types will be left unchanged. 
Warning ------- The physical representations are an implementation detail and not guaranteed to be stable. Examples -------- Replicating the pandas `pd.factorize <https://pandas.pydata.org/docs/reference/api/pandas.factorize.html>`_ function. >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( ... pl.col("vals").cast(pl.Categorical), ... pl.col("vals") ... .cast(pl.Categorical) ... .to_physical() ... .alias("vals_physical"), ... ) shape: (4, 2) ┌──────┬───────────────┐ │ vals ┆ vals_physical │ │ --- ┆ --- │ │ cat ┆ u32 │ ╞══════╪═══════════════╡ │ a ┆ 0 │ │ x ┆ 1 │ │ null ┆ null │ │ a ┆ 0 │ └──────┴───────────────┘ """ return self._from_pyexpr(self._pyexpr.to_physical()) def any(self, *, ignore_nulls: bool = True) -> Expr: """ Return whether any of the values in the column are `True`. Only works on columns of data type :class:`Boolean`. Parameters ---------- ignore_nulls Ignore null values (default). If set to `False`, `Kleene logic`_ is used to deal with nulls: if the column contains any null values and no `True` values, the output is null. .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- Expr Expression of data type :class:`Boolean`. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [True, False], ... "b": [False, False], ... "c": [None, False], ... } ... ) >>> df.select(pl.col("*").any()) shape: (1, 3) ┌──────┬───────┬───────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ bool ┆ bool ┆ bool │ ╞══════╪═══════╪═══════╡ │ true ┆ false ┆ false │ └──────┴───────┴───────┘ Enable Kleene logic by setting `ignore_nulls=False`. >>> df.select(pl.col("*").any(ignore_nulls=False)) shape: (1, 3) ┌──────┬───────┬──────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ bool ┆ bool ┆ bool │ ╞══════╪═══════╪══════╡ │ true ┆ false ┆ null │ └──────┴───────┴──────┘ """ return self._from_pyexpr(self._pyexpr.any(ignore_nulls)) def all(self, *, ignore_nulls: bool = True) -> Expr: """ Return whether all values in the column are `True`. 
Only works on columns of data type :class:`Boolean`. .. note:: This method is not to be confused with the function :func:`polars.all`, which can be used to select all columns. Parameters ---------- ignore_nulls Ignore null values (default). If set to `False`, `Kleene logic`_ is used to deal with nulls: if the column contains any null values and no `False` values, the output is null. .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic Returns ------- Expr Expression of data type :class:`Boolean`. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [True, True], ... "b": [False, True], ... "c": [None, True], ... } ... ) >>> df.select(pl.col("*").all()) shape: (1, 3) ┌──────┬───────┬──────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ bool ┆ bool ┆ bool │ ╞══════╪═══════╪══════╡ │ true ┆ false ┆ true │ └──────┴───────┴──────┘ Enable Kleene logic by setting `ignore_nulls=False`. >>> df.select(pl.col("*").all(ignore_nulls=False)) shape: (1, 3) ┌──────┬───────┬──────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ bool ┆ bool ┆ bool │ ╞══════╪═══════╪══════╡ │ true ┆ false ┆ null │ └──────┴───────┴──────┘ """ return self._from_pyexpr(self._pyexpr.all(ignore_nulls)) def arg_true(self) -> Expr: """ Return indices where expression evaluates `True`. .. warning:: Modifies number of rows returned, so will fail in combination with other expressions. Use as only expression in `select` / `with_columns`. See Also -------- Series.arg_true : Return indices where Series is True polars.arg_where Examples -------- >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) >>> df.select((pl.col("a") == 1).arg_true()) shape: (3, 1) ┌─────┐ │ a │ │ --- │ │ u32 │ ╞═════╡ │ 0 │ │ 1 │ │ 3 │ └─────┘ """ return self._from_pyexpr(py_arg_where(self._pyexpr)) def sqrt(self) -> Expr: """ Compute the square root of the elements. 
Examples -------- >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) >>> df.select(pl.col("values").sqrt()) shape: (3, 1) ┌──────────┐ │ values │ │ --- │ │ f64 │ ╞══════════╡ │ 1.0 │ │ 1.414214 │ │ 2.0 │ └──────────┘ """ return self._from_pyexpr(self._pyexpr.sqrt()) def cbrt(self) -> Expr: """ Compute the cube root of the elements. Examples -------- >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) >>> df.select(pl.col("values").cbrt()) shape: (3, 1) ┌──────────┐ │ values │ │ --- │ │ f64 │ ╞══════════╡ │ 1.0 │ │ 1.259921 │ │ 1.587401 │ └──────────┘ """ return self._from_pyexpr(self._pyexpr.cbrt()) def log10(self) -> Expr: """ Compute the base 10 logarithm of the input array, element-wise. Examples -------- >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) >>> df.select(pl.col("values").log10()) shape: (3, 1) ┌─────────┐ │ values │ │ --- │ │ f64 │ ╞═════════╡ │ 0.0 │ │ 0.30103 │ │ 0.60206 │ └─────────┘ """ return self.log(10.0) def exp(self) -> Expr: """ Compute the exponential, element-wise. Examples -------- >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) >>> df.select(pl.col("values").exp()) shape: (3, 1) ┌──────────┐ │ values │ │ --- │ │ f64 │ ╞══════════╡ │ 2.718282 │ │ 7.389056 │ │ 54.59815 │ └──────────┘ """ return self._from_pyexpr(self._pyexpr.exp()) def alias(self, name: str) -> Expr: """ Rename the expression. Parameters ---------- name The new name. See Also -------- name.map name.prefix name.suffix Examples -------- Rename an expression to avoid overwriting an existing column. >>> df = pl.DataFrame( ... { ... "a": [1, 2, 3], ... "b": ["x", "y", "z"], ... } ... ) >>> df.with_columns( ... pl.col("a") + 10, ... pl.col("b").str.to_uppercase().alias("c"), ... ) shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ str │ ╞═════╪═════╪═════╡ │ 11 ┆ x ┆ X │ │ 12 ┆ y ┆ Y │ │ 13 ┆ z ┆ Z │ └─────┴─────┴─────┘ Overwrite the default name of literal columns to prevent errors due to duplicate column names. >>> df.with_columns( ... 
pl.lit(True).alias("c"), ... pl.lit(4.0).alias("d"), ... ) shape: (3, 4) ┌─────┬─────┬──────┬─────┐ │ a ┆ b ┆ c ┆ d │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ bool ┆ f64 │ ╞═════╪═════╪══════╪═════╡ │ 1 ┆ x ┆ true ┆ 4.0 │ │ 2 ┆ y ┆ true ┆ 4.0 │ │ 3 ┆ z ┆ true ┆ 4.0 │ └─────┴─────┴──────┴─────┘ """ return self._from_pyexpr(self._pyexpr.alias(name)) def exclude( self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType, ) -> Expr: """ Exclude columns from a multi-column expression. Only works after a wildcard or regex column selection, and you cannot provide both string column names *and* dtypes (you may prefer to use selectors instead). Parameters ---------- columns The name or datatype of the column(s) to exclude. Accepts regular expression input. Regular expressions should start with `^` and end with `$`. *more_columns Additional names or datatypes of columns to exclude, specified as positional arguments. Examples -------- >>> df = pl.DataFrame( ... { ... "aa": [1, 2, 3], ... "ba": ["a", "b", None], ... "cc": [None, 2.5, 1.5], ... } ... ) >>> df shape: (3, 3) ┌─────┬──────┬──────┐ │ aa ┆ ba ┆ cc │ │ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ f64 │ ╞═════╪══════╪══════╡ │ 1 ┆ a ┆ null │ │ 2 ┆ b ┆ 2.5 │ │ 3 ┆ null ┆ 1.5 │ └─────┴──────┴──────┘ Exclude by column name(s): >>> df.select(pl.all().exclude("ba")) shape: (3, 2) ┌─────┬──────┐ │ aa ┆ cc │ │ --- ┆ --- │ │ i64 ┆ f64 │ ╞═════╪══════╡ │ 1 ┆ null │ │ 2 ┆ 2.5 │ │ 3 ┆ 1.5 │ └─────┴──────┘ Exclude by regex, e.g. removing all columns whose names end with the letter "a": >>> df.select(pl.all().exclude("^.*a$")) shape: (3, 1) ┌──────┐ │ cc │ │ --- │ │ f64 │ ╞══════╡ │ null │ │ 2.5 │ │ 1.5 │ └──────┘ Exclude by dtype(s), e.g. 
removing all columns of type Int64 or Float64: >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) shape: (3, 1) ┌──────┐ │ ba │ │ --- │ │ str │ ╞══════╡ │ a │ │ b │ │ null │ └──────┘ """ exclude_cols: list[str] = [] exclude_dtypes: list[PolarsDataType] = [] for item in ( *( columns if isinstance(columns, Collection) and not isinstance(columns, str) else [columns] ), *more_columns, ): if isinstance(item, str): exclude_cols.append(item) elif is_polars_dtype(item): exclude_dtypes.append(item) else: msg = ( "invalid input for `exclude`" f"\n\nExpected one or more `str` or `DataType`; found {item!r} instead." ) raise TypeError(msg) if exclude_cols and exclude_dtypes: msg = "cannot exclude by both column name and dtype; use a selector instead" raise TypeError(msg) elif exclude_dtypes: return self._from_pyexpr(self._pyexpr.exclude_dtype(exclude_dtypes)) else: return self._from_pyexpr(self._pyexpr.exclude(exclude_cols)) def pipe( self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs, ) -> T: r''' Offers a structured way to apply a sequence of user-defined functions (UDFs). Parameters ---------- function Callable; will receive the expression as the first parameter, followed by any given args/kwargs. *args Arguments to pass to the UDF. **kwargs Keyword arguments to pass to the UDF. Examples -------- >>> def extract_number(expr: pl.Expr) -> pl.Expr: ... """Extract the digits from a string.""" ... return expr.str.extract(r"\d+", 0).cast(pl.Int64) >>> >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: ... """Set even numbers negative, and scale by a user-supplied value.""" ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) ... return expr * n >>> >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) >>> df.with_columns( ... udfs=( ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) ... ), ... 
) shape: (4, 2) ┌──────┬──────┐ │ val ┆ udfs │ │ --- ┆ --- │ │ str ┆ i64 │ ╞══════╪══════╡ │ a: 1 ┆ 5 │ │ b: 2 ┆ -10 │ │ c: 3 ┆ 15 │ │ d: 4 ┆ -20 │ └──────┴──────┘ ''' return function(self, *args, **kwargs) def not_(self) -> Expr: """ Negate a boolean expression. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [True, False, False], ... "b": ["a", "b", None], ... } ... ) >>> df shape: (3, 2) ┌───────┬──────┐ │ a ┆ b │ │ --- ┆ --- │ │ bool ┆ str │ ╞═══════╪══════╡ │ true ┆ a │ │ false ┆ b │ │ false ┆ null │ └───────┴──────┘ >>> df.select(pl.col("a").not_()) shape: (3, 1) ┌───────┐ │ a │ │ --- │ │ bool │ ╞═══════╡ │ false │ │ true │ │ true │ └───────┘ """ return self._from_pyexpr(self._pyexpr.not_()) def is_null(self) -> Expr: """ Returns a boolean Series indicating which values are null. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None, 1, 5], ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], ... } ... ) >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null shape: (5, 4) ┌──────┬─────┬──────────┬──────────┐ │ a ┆ b ┆ a_isnull ┆ b_isnull │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ bool ┆ bool │ ╞══════╪═════╪══════════╪══════════╡ │ 1 ┆ 1.0 ┆ false ┆ false │ │ 2 ┆ 2.0 ┆ false ┆ false │ │ null ┆ NaN ┆ true ┆ false │ │ 1 ┆ 1.0 ┆ false ┆ false │ │ 5 ┆ 5.0 ┆ false ┆ false │ └──────┴─────┴──────────┴──────────┘ """ return self._from_pyexpr(self._pyexpr.is_null()) def is_not_null(self) -> Expr: """ Returns a boolean Series indicating which values are not null. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None, 1, 5], ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], ... } ... ) >>> df.with_columns( ... pl.all().is_not_null().name.suffix("_not_null") # nan != null ... 
) shape: (5, 4) ┌──────┬─────┬────────────┬────────────┐ │ a ┆ b ┆ a_not_null ┆ b_not_null │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ bool ┆ bool │ ╞══════╪═════╪════════════╪════════════╡ │ 1 ┆ 1.0 ┆ true ┆ true │ │ 2 ┆ 2.0 ┆ true ┆ true │ │ null ┆ NaN ┆ false ┆ true │ │ 1 ┆ 1.0 ┆ true ┆ true │ │ 5 ┆ 5.0 ┆ true ┆ true │ └──────┴─────┴────────────┴────────────┘ """ return self._from_pyexpr(self._pyexpr.is_not_null()) def is_finite(self) -> Expr: """ Returns a boolean Series indicating which values are finite. Returns ------- Expr Expression of data type :class:`Boolean`. Examples -------- >>> df = pl.DataFrame( ... { ... "A": [1.0, 2], ... "B": [3.0, float("inf")], ... } ... ) >>> df.select(pl.all().is_finite()) shape: (2, 2) ┌──────┬───────┐ │ A ┆ B │ │ --- ┆ --- │ │ bool ┆ bool │ ╞══════╪═══════╡ │ true ┆ true │ │ true ┆ false │ └──────┴───────┘ """ return self._from_pyexpr(self._pyexpr.is_finite()) def is_infinite(self) -> Expr: """ Returns a boolean Series indicating which values are infinite. Returns ------- Expr Expression of data type :class:`Boolean`. Examples -------- >>> df = pl.DataFrame( ... { ... "A": [1.0, 2], ... "B": [3.0, float("inf")], ... } ... ) >>> df.select(pl.all().is_infinite()) shape: (2, 2) ┌───────┬───────┐ │ A ┆ B │ │ --- ┆ --- │ │ bool ┆ bool │ ╞═══════╪═══════╡ │ false ┆ false │ │ false ┆ true │ └───────┴───────┘ """ return self._from_pyexpr(self._pyexpr.is_infinite()) def is_nan(self) -> Expr: """ Returns a boolean Series indicating which values are NaN. Notes ----- Floating point `NaN` (Not A Number) should not be confused with missing data represented as `Null/None`. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None, 1, 5], ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], ... } ... 
) >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) shape: (5, 3) ┌──────┬─────┬─────────┐ │ a ┆ b ┆ b_isnan │ │ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ bool │ ╞══════╪═════╪═════════╡ │ 1 ┆ 1.0 ┆ false │ │ 2 ┆ 2.0 ┆ false │ │ null ┆ NaN ┆ true │ │ 1 ┆ 1.0 ┆ false │ │ 5 ┆ 5.0 ┆ false │ └──────┴─────┴─────────┘ """ return self._from_pyexpr(self._pyexpr.is_nan()) def is_not_nan(self) -> Expr: """ Returns a boolean Series indicating which values are not NaN. Notes ----- Floating point `NaN` (Not A Number) should not be confused with missing data represented as `Null/None`. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None, 1, 5], ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], ... } ... ) >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) shape: (5, 3) ┌──────┬─────┬──────────────┐ │ a ┆ b ┆ b_is_not_nan │ │ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ bool │ ╞══════╪═════╪══════════════╡ │ 1 ┆ 1.0 ┆ true │ │ 2 ┆ 2.0 ┆ true │ │ null ┆ NaN ┆ false │ │ 1 ┆ 1.0 ┆ true │ │ 5 ┆ 5.0 ┆ true │ └──────┴─────┴──────────────┘ """ return self._from_pyexpr(self._pyexpr.is_not_nan()) def agg_groups(self) -> Expr: """ Get the group indexes of the group by operation. Should be used in aggregation context only. Examples -------- >>> df = pl.DataFrame( ... { ... "group": [ ... "one", ... "one", ... "one", ... "two", ... "two", ... "two", ... ], ... "value": [94, 95, 96, 97, 97, 99], ... } ... ) >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) shape: (2, 2) ┌───────┬───────────┐ │ group ┆ value │ │ --- ┆ --- │ │ str ┆ list[u32] │ ╞═══════╪═══════════╡ │ one ┆ [0, 1, 2] │ │ two ┆ [3, 4, 5] │ └───────┴───────────┘ """ return self._from_pyexpr(self._pyexpr.agg_groups()) def count(self) -> Expr: """ Return the number of non-null elements in the column. Returns ------- Expr Expression of data type :class:`UInt32`. 
See Also -------- len Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) >>> df.select(pl.all().count()) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ u32 ┆ u32 │ ╞═════╪═════╡ │ 3 ┆ 2 │ └─────┴─────┘ """ return self._from_pyexpr(self._pyexpr.count()) def len(self) -> Expr: """ Return the number of elements in the column. Null values count towards the total. Returns ------- Expr Expression of data type :class:`UInt32`. See Also -------- count Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) >>> df.select(pl.all().len()) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ u32 ┆ u32 │ ╞═════╪═════╡ │ 3 ┆ 3 │ └─────┴─────┘ """ return self._from_pyexpr(self._pyexpr.len()) def slice(self, offset: int | Expr, length: int | Expr | None = None) -> Expr: """ Get a slice of this expression. Parameters ---------- offset Start index. Negative indexing is supported. length Length of the slice. If set to `None`, all rows starting at the offset will be selected. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [8, 9, 10, 11], ... "b": [None, 4, 4, 4], ... } ... ) >>> df.select(pl.all().slice(1, 2)) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 9 ┆ 4 │ │ 10 ┆ 4 │ └─────┴─────┘ """ if not isinstance(offset, Expr): offset = F.lit(offset) if not isinstance(length, Expr): length = F.lit(length) return self._from_pyexpr(self._pyexpr.slice(offset._pyexpr, length._pyexpr)) def append(self, other: IntoExpr, *, upcast: bool = True) -> Expr: """ Append expressions. This is done by adding the chunks of `other` to this `Series`. Parameters ---------- other Expression to append. upcast Cast both `Series` to the same supertype. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [8, 9, 10], ... "b": [None, 4, 4], ... } ... 
) >>> df.select(pl.all().head(1).append(pl.all().tail(1))) shape: (2, 2) ┌─────┬──────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪══════╡ │ 8 ┆ null │ │ 10 ┆ 4 │ └─────┴──────┘ """ other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr.append(other, upcast)) def rechunk(self) -> Expr: """ Create a single chunk of memory for this Series. Examples -------- >>> df = pl.DataFrame({"a": [1, 1, 2]}) Create a Series with 3 nulls, append column a then rechunk >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) shape: (6, 1) ┌────────┐ │ repeat │ │ --- │ │ i64 │ ╞════════╡ │ null │ │ null │ │ null │ │ 1 │ │ 1 │ │ 2 │ └────────┘ """ return self._from_pyexpr(self._pyexpr.rechunk()) def drop_nulls(self) -> Expr: """ Drop all null values. The original order of the remaining elements is preserved. See Also -------- drop_nans Notes ----- A null value is not the same as a NaN value. To drop NaN values, use :func:`drop_nans`. Examples -------- >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) >>> df.select(pl.col("a").drop_nulls()) shape: (3, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 1.0 │ │ 3.0 │ │ NaN │ └─────┘ """ return self._from_pyexpr(self._pyexpr.drop_nulls()) def drop_nans(self) -> Expr: """ Drop all floating point NaN values. The original order of the remaining elements is preserved. See Also -------- drop_nulls Notes ----- A NaN value is not the same as a null value. To drop null values, use :func:`drop_nulls`. Examples -------- >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) >>> df.select(pl.col("a").drop_nans()) shape: (3, 1) ┌──────┐ │ a │ │ --- │ │ f64 │ ╞══════╡ │ 1.0 │ │ null │ │ 3.0 │ └──────┘ """ return self._from_pyexpr(self._pyexpr.drop_nans()) def cum_sum(self, *, reverse: bool = False) -> Expr: """ Get an array with the cumulative sum computed at every element. Parameters ---------- reverse Reverse the operation. 
Notes ----- Dtypes in {Int8, UInt8, Int16, UInt16} are cast to Int64 before summing to prevent overflow issues. Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) >>> df.with_columns( ... pl.col("a").cum_sum().alias("cum_sum"), ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), ... ) shape: (4, 3) ┌─────┬─────────┬─────────────────┐ │ a ┆ cum_sum ┆ cum_sum_reverse │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞═════╪═════════╪═════════════════╡ │ 1 ┆ 1 ┆ 10 │ │ 2 ┆ 3 ┆ 9 │ │ 3 ┆ 6 ┆ 7 │ │ 4 ┆ 10 ┆ 4 │ └─────┴─────────┴─────────────────┘ Null values are excluded, but can also be filled by calling `forward_fill`. >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) >>> df.with_columns( ... pl.col("values").cum_sum().alias("value_cum_sum"), ... pl.col("values") ... .cum_sum() ... .forward_fill() ... .alias("value_cum_sum_all_filled"), ... ) shape: (8, 3) ┌────────┬───────────────┬──────────────────────────┐ │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞════════╪═══════════════╪══════════════════════════╡ │ null ┆ null ┆ null │ │ 10 ┆ 10 ┆ 10 │ │ null ┆ null ┆ 10 │ │ 8 ┆ 18 ┆ 18 │ │ 9 ┆ 27 ┆ 27 │ │ null ┆ null ┆ 27 │ │ 16 ┆ 43 ┆ 43 │ │ null ┆ null ┆ 43 │ └────────┴───────────────┴──────────────────────────┘ """ return self._from_pyexpr(self._pyexpr.cum_sum(reverse)) def cum_prod(self, *, reverse: bool = False) -> Expr: """ Get an array with the cumulative product computed at every element. Parameters ---------- reverse Reverse the operation. Notes ----- Dtypes in {Int8, UInt8, Int16, UInt16} are cast to Int64 before summing to prevent overflow issues. Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) >>> df.with_columns( ... pl.col("a").cum_prod().alias("cum_prod"), ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), ... 
) shape: (4, 3) ┌─────┬──────────┬──────────────────┐ │ a ┆ cum_prod ┆ cum_prod_reverse │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞═════╪══════════╪══════════════════╡ │ 1 ┆ 1 ┆ 24 │ │ 2 ┆ 2 ┆ 24 │ │ 3 ┆ 6 ┆ 12 │ │ 4 ┆ 24 ┆ 4 │ └─────┴──────────┴──────────────────┘ """ return self._from_pyexpr(self._pyexpr.cum_prod(reverse)) def cum_min(self, *, reverse: bool = False) -> Expr: """ Get an array with the cumulative min computed at every element. Parameters ---------- reverse Reverse the operation. Examples -------- >>> df = pl.DataFrame({"a": [3, 1, 2]}) >>> df.with_columns( ... pl.col("a").cum_min().alias("cum_min"), ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), ... ) shape: (3, 3) ┌─────┬─────────┬─────────────────┐ │ a ┆ cum_min ┆ cum_min_reverse │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞═════╪═════════╪═════════════════╡ │ 3 ┆ 3 ┆ 1 │ │ 1 ┆ 1 ┆ 1 │ │ 2 ┆ 1 ┆ 2 │ └─────┴─────────┴─────────────────┘ """ return self._from_pyexpr(self._pyexpr.cum_min(reverse)) def cum_max(self, *, reverse: bool = False) -> Expr: """ Get an array with the cumulative max computed at every element. Parameters ---------- reverse Reverse the operation. Examples -------- >>> df = pl.DataFrame({"a": [1, 3, 2]}) >>> df.with_columns( ... pl.col("a").cum_max().alias("cum_max"), ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), ... ) shape: (3, 3) ┌─────┬─────────┬─────────────────┐ │ a ┆ cum_max ┆ cum_max_reverse │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞═════╪═════════╪═════════════════╡ │ 1 ┆ 1 ┆ 3 │ │ 3 ┆ 3 ┆ 3 │ │ 2 ┆ 3 ┆ 2 │ └─────┴─────────┴─────────────────┘ Null values are excluded, but can also be filled by calling `forward_fill`. >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) >>> df.with_columns( ... pl.col("values").cum_max().alias("cum_max"), ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), ... 
) shape: (8, 3) ┌────────┬─────────┬────────────────────┐ │ values ┆ cum_max ┆ cum_max_all_filled │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞════════╪═════════╪════════════════════╡ │ null ┆ null ┆ null │ │ 10 ┆ 10 ┆ 10 │ │ null ┆ null ┆ 10 │ │ 8 ┆ 10 ┆ 10 │ │ 9 ┆ 10 ┆ 10 │ │ null ┆ null ┆ 10 │ │ 16 ┆ 16 ┆ 16 │ │ null ┆ null ┆ 16 │ └────────┴─────────┴────────────────────┘ """ return self._from_pyexpr(self._pyexpr.cum_max(reverse)) def cum_count(self, *, reverse: bool = False) -> Expr: """ Return the cumulative count of the non-null values in the column. Parameters ---------- reverse Reverse the operation. Examples -------- >>> df = pl.DataFrame({"a": ["x", "k", None, "d"]}) >>> df.with_columns( ... pl.col("a").cum_count().alias("cum_count"), ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), ... ) shape: (4, 3) ┌──────┬───────────┬───────────────────┐ │ a ┆ cum_count ┆ cum_count_reverse │ │ --- ┆ --- ┆ --- │ │ str ┆ u32 ┆ u32 │ ╞══════╪═══════════╪═══════════════════╡ │ x ┆ 1 ┆ 3 │ │ k ┆ 2 ┆ 2 │ │ null ┆ 2 ┆ 1 │ │ d ┆ 3 ┆ 1 │ └──────┴───────────┴───────────────────┘ """ return self._from_pyexpr(self._pyexpr.cum_count(reverse)) def floor(self) -> Expr: """ Rounds down to the nearest integer value. Only works on floating point Series. Examples -------- >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) >>> df.select(pl.col("a").floor()) shape: (4, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 0.0 │ │ 0.0 │ │ 1.0 │ │ 1.0 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.floor()) def ceil(self) -> Expr: """ Rounds up to the nearest integer value. Only works on floating point Series. Examples -------- >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) >>> df.select(pl.col("a").ceil()) shape: (4, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 1.0 │ │ 1.0 │ │ 1.0 │ │ 2.0 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.ceil()) def round(self, decimals: int = 0) -> Expr: """ Round underlying floating point data by `decimals` digits. 
Parameters ---------- decimals Number of decimals to round by. Examples -------- >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) >>> df.select(pl.col("a").round(1)) shape: (4, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 0.3 │ │ 0.5 │ │ 1.0 │ │ 1.2 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.round(decimals)) def round_sig_figs(self, digits: int) -> Expr: """ Round to a number of significant figures. Parameters ---------- digits Number of significant figures to round to. Examples -------- >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) shape: (3, 2) ┌─────────┬────────────────┐ │ a ┆ round_sig_figs │ │ --- ┆ --- │ │ f64 ┆ f64 │ ╞═════════╪════════════════╡ │ 0.01234 ┆ 0.012 │ │ 3.333 ┆ 3.3 │ │ 1234.0 ┆ 1200.0 │ └─────────┴────────────────┘ """ return self._from_pyexpr(self._pyexpr.round_sig_figs(digits)) def dot(self, other: Expr | str) -> Expr: """ Compute the dot/inner product between two Expressions. Parameters ---------- other Expression to compute dot product with. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 3, 5], ... "b": [2, 4, 6], ... } ... ) >>> df.select(pl.col("a").dot(pl.col("b"))) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ i64 │ ╞═════╡ │ 44 │ └─────┘ """ other = parse_into_expression(other) return self._from_pyexpr(self._pyexpr.dot(other)) def mode(self) -> Expr: """ Compute the most occurring value(s). Can return multiple Values. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 1, 2, 3], ... "b": [1, 1, 2, 2], ... } ... ) >>> df.select(pl.all().mode().first()) # doctest: +IGNORE_RESULT shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 1 ┆ 1 │ └─────┴─────┘ """ return self._from_pyexpr(self._pyexpr.mode()) def cast( self, dtype: PolarsDataType | type[Any], *, strict: bool = True, wrap_numerical: bool = False, ) -> Expr: r""" Cast between data types. Parameters ---------- dtype DataType to cast to. 
strict If True invalid casts generate exceptions instead of `null`\s. wrap_numerical If True numeric casts wrap overflowing values instead of marking the cast as invalid. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, 3], ... "b": ["4", "5", "6"], ... } ... ) >>> df.with_columns( ... pl.col("a").cast(pl.Float64), ... pl.col("b").cast(pl.Int32), ... ) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ f64 ┆ i32 │ ╞═════╪═════╡ │ 1.0 ┆ 4 │ │ 2.0 ┆ 5 │ │ 3.0 ┆ 6 │ └─────┴─────┘ """ dtype = parse_into_dtype(dtype) return self._from_pyexpr(self._pyexpr.cast(dtype, strict, wrap_numerical)) def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr: """ Sort this column. When used in a projection/selection context, the whole column is sorted. When used in a group by context, the groups are sorted. Parameters ---------- descending Sort in descending order. nulls_last Place null values last. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, None, 3, 2], ... } ... ) >>> df.select(pl.col("a").sort()) shape: (4, 1) ┌──────┐ │ a │ │ --- │ │ i64 │ ╞══════╡ │ null │ │ 1 │ │ 2 │ │ 3 │ └──────┘ >>> df.select(pl.col("a").sort(descending=True)) shape: (4, 1) ┌──────┐ │ a │ │ --- │ │ i64 │ ╞══════╡ │ null │ │ 3 │ │ 2 │ │ 1 │ └──────┘ >>> df.select(pl.col("a").sort(nulls_last=True)) shape: (4, 1) ┌──────┐ │ a │ │ --- │ │ i64 │ ╞══════╡ │ 1 │ │ 2 │ │ 3 │ │ null │ └──────┘ When sorting in a group by context, the groups are sorted. >>> df = pl.DataFrame( ... { ... "group": ["one", "one", "one", "two", "two", "two"], ... "value": [1, 98, 2, 3, 99, 4], ... } ... 
) >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT shape: (2, 2) ┌───────┬────────────┐ │ group ┆ value │ │ --- ┆ --- │ │ str ┆ list[i64] │ ╞═══════╪════════════╡ │ two ┆ [3, 4, 99] │ │ one ┆ [1, 2, 98] │ └───────┴────────────┘ """ return self._from_pyexpr(self._pyexpr.sort_with(descending, nulls_last)) def top_k(self, k: int | IntoExprColumn = 5) -> Expr: r""" Return the `k` largest elements. Non-null elements are always preferred over null elements. The output is not guaranteed to be in any particular order, call :func:`sort` after this function if you wish the output to be sorted. This has time complexity: .. math:: O(n) Parameters ---------- k Number of elements to return. See Also -------- top_k_by bottom_k bottom_k_by Examples -------- Get the 5 largest values in series. >>> df = pl.DataFrame({"value": [1, 98, 2, 3, 99, 4]}) >>> df.select( ... pl.col("value").top_k().alias("top_k"), ... pl.col("value").bottom_k().alias("bottom_k"), ... ) shape: (5, 2) ┌───────┬──────────┐ │ top_k ┆ bottom_k │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═══════╪══════════╡ │ 4 ┆ 1 │ │ 98 ┆ 98 │ │ 2 ┆ 2 │ │ 3 ┆ 3 │ │ 99 ┆ 4 │ └───────┴──────────┘ """ k = parse_into_expression(k) return self._from_pyexpr(self._pyexpr.top_k(k)) @deprecate_renamed_parameter("descending", "reverse", version="1.0.0") def top_k_by( self, by: IntoExpr | Iterable[IntoExpr], k: int | IntoExprColumn = 5, *, reverse: bool | Sequence[bool] = False, ) -> Expr: r""" Return the elements corresponding to the `k` largest elements of the `by` column(s). Non-null elements are always preferred over null elements, regardless of the value of `reverse`. The output is not guaranteed to be in any particular order, call :func:`sort` after this function if you wish the output to be sorted. This has time complexity: .. math:: O(n \log{n}) Parameters ---------- by Column(s) used to determine the largest elements. Accepts expression input. Strings are parsed as column names. k Number of elements to return. 
reverse Consider the `k` smallest elements of the `by` column(s) (instead of the `k` largest). This can be specified per column by passing a sequence of booleans. See Also -------- top_k bottom_k bottom_k_by Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, 3, 4, 5, 6], ... "b": [6, 5, 4, 3, 2, 1], ... "c": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) >>> df shape: (6, 3) ┌─────┬─────┬────────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ str │ ╞═════╪═════╪════════╡ │ 1 ┆ 6 ┆ Apple │ │ 2 ┆ 5 ┆ Orange │ │ 3 ┆ 4 ┆ Apple │ │ 4 ┆ 3 ┆ Apple │ │ 5 ┆ 2 ┆ Banana │ │ 6 ┆ 1 ┆ Banana │ └─────┴─────┴────────┘ Get the top 2 rows by column `a` or `b`. >>> df.select( ... pl.all().top_k_by("a", 2).name.suffix("_top_by_a"), ... pl.all().top_k_by("b", 2).name.suffix("_top_by_b"), ... ) shape: (2, 6) ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐ │ a_top_by_a ┆ b_top_by_a ┆ c_top_by_a ┆ a_top_by_b ┆ b_top_by_b ┆ c_top_by_b │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ str │ ╞════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡ │ 6 ┆ 1 ┆ Banana ┆ 1 ┆ 6 ┆ Apple │ │ 5 ┆ 2 ┆ Banana ┆ 2 ┆ 5 ┆ Orange │ └────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘ Get the top 2 rows by multiple columns with given order. >>> df.select( ... pl.all() ... .top_k_by(["c", "a"], 2, reverse=[False, True]) ... .name.suffix("_by_ca"), ... pl.all() ... .top_k_by(["c", "b"], 2, reverse=[False, True]) ... .name.suffix("_by_cb"), ... 
) shape: (2, 6) ┌─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐ │ a_by_ca ┆ b_by_ca ┆ c_by_ca ┆ a_by_cb ┆ b_by_cb ┆ c_by_cb │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ str │ ╞═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡ │ 2 ┆ 5 ┆ Orange ┆ 2 ┆ 5 ┆ Orange │ │ 5 ┆ 2 ┆ Banana ┆ 6 ┆ 1 ┆ Banana │ └─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ Get the top 2 rows by column `a` in each group. >>> ( ... df.group_by("c", maintain_order=True) ... .agg(pl.all().top_k_by("a", 2)) ... .explode(pl.all().exclude("c")) ... ) shape: (5, 3) ┌────────┬─────┬─────┐ │ c ┆ a ┆ b │ │ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 │ ╞════════╪═════╪═════╡ │ Apple ┆ 4 ┆ 3 │ │ Apple ┆ 3 ┆ 4 │ │ Orange ┆ 2 ┆ 5 │ │ Banana ┆ 6 ┆ 1 │ │ Banana ┆ 5 ┆ 2 │ └────────┴─────┴─────┘ """ # noqa: W505 k = parse_into_expression(k) by = parse_into_list_of_expressions(by) reverse = extend_bool(reverse, len(by), "reverse", "by") return self._from_pyexpr(self._pyexpr.top_k_by(by, k=k, reverse=reverse)) def bottom_k(self, k: int | IntoExprColumn = 5) -> Expr: r""" Return the `k` smallest elements. Non-null elements are always preferred over null elements. The output is not guaranteed to be in any particular order, call :func:`sort` after this function if you wish the output to be sorted. This has time complexity: .. math:: O(n) Parameters ---------- k Number of elements to return. See Also -------- top_k top_k_by bottom_k_by Examples -------- >>> df = pl.DataFrame( ... { ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) >>> df.select( ... pl.col("value").top_k().alias("top_k"), ... pl.col("value").bottom_k().alias("bottom_k"), ... 
) shape: (5, 2) ┌───────┬──────────┐ │ top_k ┆ bottom_k │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═══════╪══════════╡ │ 4 ┆ 1 │ │ 98 ┆ 98 │ │ 2 ┆ 2 │ │ 3 ┆ 3 │ │ 99 ┆ 4 │ └───────┴──────────┘ """ k = parse_into_expression(k) return self._from_pyexpr(self._pyexpr.bottom_k(k)) @deprecate_renamed_parameter("descending", "reverse", version="1.0.0") def bottom_k_by( self, by: IntoExpr | Iterable[IntoExpr], k: int | IntoExprColumn = 5, *, reverse: bool | Sequence[bool] = False, ) -> Expr: r""" Return the elements corresponding to the `k` smallest elements of the `by` column(s). Non-null elements are always preferred over null elements, regardless of the value of `reverse`. The output is not guaranteed to be in any particular order, call :func:`sort` after this function if you wish the output to be sorted. This has time complexity: .. math:: O(n \log{n}) Parameters ---------- by Column(s) used to determine the smallest elements. Accepts expression input. Strings are parsed as column names. k Number of elements to return. reverse Consider the `k` largest elements of the `by` column(s) (instead of the `k` smallest). This can be specified per column by passing a sequence of booleans. See Also -------- top_k top_k_by bottom_k Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, 3, 4, 5, 6], ... "b": [6, 5, 4, 3, 2, 1], ... "c": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"], ... } ... ) >>> df shape: (6, 3) ┌─────┬─────┬────────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ str │ ╞═════╪═════╪════════╡ │ 1 ┆ 6 ┆ Apple │ │ 2 ┆ 5 ┆ Orange │ │ 3 ┆ 4 ┆ Apple │ │ 4 ┆ 3 ┆ Apple │ │ 5 ┆ 2 ┆ Banana │ │ 6 ┆ 1 ┆ Banana │ └─────┴─────┴────────┘ Get the bottom 2 rows by column `a` or `b`. >>> df.select( ... pl.all().bottom_k_by("a", 2).name.suffix("_btm_by_a"), ... pl.all().bottom_k_by("b", 2).name.suffix("_btm_by_b"), ... 
) shape: (2, 6) ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐ │ a_btm_by_a ┆ b_btm_by_a ┆ c_btm_by_a ┆ a_btm_by_b ┆ b_btm_by_b ┆ c_btm_by_b │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ str │ ╞════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡ │ 1 ┆ 6 ┆ Apple ┆ 6 ┆ 1 ┆ Banana │ │ 2 ┆ 5 ┆ Orange ┆ 5 ┆ 2 ┆ Banana │ └────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘ Get the bottom 2 rows by multiple columns with given order. >>> df.select( ... pl.all() ... .bottom_k_by(["c", "a"], 2, reverse=[False, True]) ... .name.suffix("_by_ca"), ... pl.all() ... .bottom_k_by(["c", "b"], 2, reverse=[False, True]) ... .name.suffix("_by_cb"), ... ) shape: (2, 6) ┌─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐ │ a_by_ca ┆ b_by_ca ┆ c_by_ca ┆ a_by_cb ┆ b_by_cb ┆ c_by_cb │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 ┆ str │ ╞═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡ │ 4 ┆ 3 ┆ Apple ┆ 1 ┆ 6 ┆ Apple │ │ 3 ┆ 4 ┆ Apple ┆ 3 ┆ 4 ┆ Apple │ └─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘ Get the bottom 2 rows by column `a` in each group. >>> ( ... df.group_by("c", maintain_order=True) ... .agg(pl.all().bottom_k_by("a", 2)) ... .explode(pl.all().exclude("c")) ... ) shape: (5, 3) ┌────────┬─────┬─────┐ │ c ┆ a ┆ b │ │ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 │ ╞════════╪═════╪═════╡ │ Apple ┆ 1 ┆ 6 │ │ Apple ┆ 3 ┆ 4 │ │ Orange ┆ 2 ┆ 5 │ │ Banana ┆ 5 ┆ 2 │ │ Banana ┆ 6 ┆ 1 │ └────────┴─────┴─────┘ """ # noqa: W505 k = parse_into_expression(k) by = parse_into_list_of_expressions(by) reverse = extend_bool(reverse, len(by), "reverse", "by") return self._from_pyexpr(self._pyexpr.bottom_k_by(by, k=k, reverse=reverse)) def arg_sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr: """ Get the index values that would sort this column. 
Parameters ---------- descending Sort in descending order. nulls_last Place null values last instead of first. Returns ------- Expr Expression of data type :class:`UInt32`. See Also -------- Expr.gather : Take values by index. Expr.rank : Get the rank of each row. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [20, 10, 30], ... "b": [1, 2, 3], ... } ... ) >>> df.select(pl.col("a").arg_sort()) shape: (3, 1) ┌─────┐ │ a │ │ --- │ │ u32 │ ╞═════╡ │ 1 │ │ 0 │ │ 2 │ └─────┘ Use gather to apply the arg sort to other columns. >>> df.select(pl.col("b").gather(pl.col("a").arg_sort())) shape: (3, 1) ┌─────┐ │ b │ │ --- │ │ i64 │ ╞═════╡ │ 2 │ │ 1 │ │ 3 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.arg_sort(descending, nulls_last)) def arg_max(self) -> Expr: """ Get the index of the maximal value. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [20, 10, 30], ... } ... ) >>> df.select(pl.col("a").arg_max()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ u32 │ ╞═════╡ │ 2 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.arg_max()) def arg_min(self) -> Expr: """ Get the index of the minimal value. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [20, 10, 30], ... } ... ) >>> df.select(pl.col("a").arg_min()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ u32 │ ╞═════╡ │ 1 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.arg_min()) def search_sorted( self, element: IntoExpr | np.ndarray[Any, Any], side: SearchSortedSide = "any" ) -> Expr: """ Find indices where elements should be inserted to maintain order. .. math:: a[i-1] < v <= a[i] Parameters ---------- element Expression or scalar value. side : {'any', 'left', 'right'} If 'any', the index of the first suitable location found is given. If 'left', the index of the leftmost suitable location found is given. If 'right', the index of the rightmost suitable location found is given. Examples -------- >>> df = pl.DataFrame( ... { ... "values": [1, 2, 3, 5], ... } ... ) >>> df.select( ... [ ... 
pl.col("values").search_sorted(0).alias("zero"), ... pl.col("values").search_sorted(3).alias("three"), ... pl.col("values").search_sorted(6).alias("six"), ... ] ... ) shape: (1, 3) ┌──────┬───────┬─────┐ │ zero ┆ three ┆ six │ │ --- ┆ --- ┆ --- │ │ u32 ┆ u32 ┆ u32 │ ╞══════╪═══════╪═════╡ │ 0 ┆ 2 ┆ 4 │ └──────┴───────┴─────┘ """ element = parse_into_expression(element, str_as_lit=True, list_as_series=True) # type: ignore[arg-type] return self._from_pyexpr(self._pyexpr.search_sorted(element, side)) def sort_by( self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = False, nulls_last: bool | Sequence[bool] = False, multithreaded: bool = True, maintain_order: bool = False, ) -> Expr: """ Sort this column by the ordering of other columns. When used in a projection/selection context, the whole column is sorted. When used in a group by context, the groups are sorted. Parameters ---------- by Column(s) to sort by. Accepts expression input. Strings are parsed as column names. *more_by Additional columns to sort by, specified as positional arguments. descending Sort in descending order. When sorting by multiple columns, can be specified per column by passing a sequence of booleans. nulls_last Place null values last; can specify a single boolean applying to all columns or a sequence of booleans for per-column control. multithreaded Sort using multiple threads. maintain_order Whether the order should be maintained if elements are equal. Examples -------- Pass a single column name to sort by that column. >>> df = pl.DataFrame( ... { ... "group": ["a", "a", "b", "b"], ... "value1": [1, 3, 4, 2], ... "value2": [8, 7, 6, 5], ... } ... ) >>> df.select(pl.col("group").sort_by("value1")) shape: (4, 1) ┌───────┐ │ group │ │ --- │ │ str │ ╞═══════╡ │ a │ │ b │ │ a │ │ b │ └───────┘ Sorting by expressions is also supported. 
>>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) shape: (4, 1) ┌───────┐ │ group │ │ --- │ │ str │ ╞═══════╡ │ b │ │ a │ │ a │ │ b │ └───────┘ Sort by multiple columns by passing a list of columns. >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) shape: (4, 1) ┌───────┐ │ group │ │ --- │ │ str │ ╞═══════╡ │ b │ │ a │ │ b │ │ a │ └───────┘ Or use positional arguments to sort by multiple columns in the same way. >>> df.select(pl.col("group").sort_by("value1", "value2")) shape: (4, 1) ┌───────┐ │ group │ │ --- │ │ str │ ╞═══════╡ │ a │ │ b │ │ a │ │ b │ └───────┘ When sorting in a group by context, the groups are sorted. >>> df.group_by("group").agg( ... pl.col("value1").sort_by("value2") ... ) # doctest: +IGNORE_RESULT shape: (2, 2) ┌───────┬───────────┐ │ group ┆ value1 │ │ --- ┆ --- │ │ str ┆ list[i64] │ ╞═══════╪═══════════╡ │ a ┆ [3, 1] │ │ b ┆ [2, 4] │ └───────┴───────────┘ Take a single row from each group where a column attains its minimal value within that group. >>> df.group_by("group").agg( ... pl.all().sort_by("value2").first() ... ) # doctest: +IGNORE_RESULT shape: (2, 3) ┌───────┬────────┬────────┐ │ group ┆ value1 ┆ value2 │ │ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 │ ╞═══════╪════════╪════════╡ │ a ┆ 3 ┆ 7 │ │ b ┆ 2 ┆ 5 │ └───────┴────────┴────────┘ """ by = parse_into_list_of_expressions(by, *more_by) descending = extend_bool(descending, len(by), "descending", "by") nulls_last = extend_bool(nulls_last, len(by), "nulls_last", "by") return self._from_pyexpr( self._pyexpr.sort_by( by, descending, nulls_last, multithreaded, maintain_order ) ) def gather( self, indices: int | Sequence[int] | IntoExpr | Series | np.ndarray[Any, Any] ) -> Expr: """ Take values by index. Parameters ---------- indices Index or indices to take. Accepts an integer, a sequence of integers, a Series, a NumPy array, or an expression. Non-string sequences and NumPy arrays are converted to an Int64 Series literal. Returns ------- Expr Expression of the same data type. See Also -------- Expr.get : Take a single value Examples -------- >>> df = pl.DataFrame( ... { ... 
"group": [ ... "one", ... "one", ... "one", ... "two", ... "two", ... "two", ... ], ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) >>> df.group_by("group", maintain_order=True).agg( ... pl.col("value").gather([2, 1]) ... ) shape: (2, 2) ┌───────┬───────────┐ │ group ┆ value │ │ --- ┆ --- │ │ str ┆ list[i64] │ ╞═══════╪═══════════╡ │ one ┆ [2, 98] │ │ two ┆ [4, 99] │ └───────┴───────────┘ """ if (isinstance(indices, Sequence) and not isinstance(indices, str)) or ( _check_for_numpy(indices) and isinstance(indices, np.ndarray) ): indices_lit = F.lit(pl.Series("", indices, dtype=Int64))._pyexpr else: indices_lit = parse_into_expression(indices) # type: ignore[arg-type] return self._from_pyexpr(self._pyexpr.gather(indices_lit)) def get(self, index: int | Expr) -> Expr: """ Return a single value by index. Parameters ---------- index An expression that leads to a UInt32 index. Returns ------- Expr Expression of the same data type. Examples -------- >>> df = pl.DataFrame( ... { ... "group": [ ... "one", ... "one", ... "one", ... "two", ... "two", ... "two", ... ], ... "value": [1, 98, 2, 3, 99, 4], ... } ... ) >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) shape: (2, 2) ┌───────┬───────┐ │ group ┆ value │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪═══════╡ │ one ┆ 98 │ │ two ┆ 99 │ └───────┴───────┘ """ index_lit = parse_into_expression(index) return self._from_pyexpr(self._pyexpr.get(index_lit)) def shift( self, n: int | IntoExprColumn = 1, *, fill_value: IntoExpr | None = None ) -> Expr: """ Shift values by the given number of indices. Parameters ---------- n Number of indices to shift forward. If a negative value is passed, values are shifted in the opposite direction instead. fill_value Fill the resulting null values with this value. Notes ----- This method is similar to the `LAG` operation in SQL when the value for `n` is positive. With a negative value for `n`, it is similar to `LEAD`. 
See Also -------- backward_fill forward_fill Examples -------- By default, values are shifted forward by one index. >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) >>> df.with_columns(shift=pl.col("a").shift()) shape: (4, 2) ┌─────┬───────┐ │ a ┆ shift │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═══════╡ │ 1 ┆ null │ │ 2 ┆ 1 │ │ 3 ┆ 2 │ │ 4 ┆ 3 │ └─────┴───────┘ Pass a negative value to shift in the opposite direction instead. >>> df.with_columns(shift=pl.col("a").shift(-2)) shape: (4, 2) ┌─────┬───────┐ │ a ┆ shift │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═══════╡ │ 1 ┆ 3 │ │ 2 ┆ 4 │ │ 3 ┆ null │ │ 4 ┆ null │ └─────┴───────┘ Specify `fill_value` to fill the resulting null values. >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) shape: (4, 2) ┌─────┬───────┐ │ a ┆ shift │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═══════╡ │ 1 ┆ 3 │ │ 2 ┆ 4 │ │ 3 ┆ 100 │ │ 4 ┆ 100 │ └─────┴───────┘ """ if fill_value is not None: fill_value = parse_into_expression(fill_value, str_as_lit=True) n = parse_into_expression(n) return self._from_pyexpr(self._pyexpr.shift(n, fill_value)) def fill_null( self, value: Any | Expr | None = None, strategy: FillNullStrategy | None = None, limit: int | None = None, ) -> Expr: """ Fill null values using the specified value or strategy. To interpolate over null values see interpolate. See the examples below to fill nulls with an expression. Parameters ---------- value Value used to fill null values. strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'} Strategy used to fill null values. limit Number of consecutive null values to fill when using the 'forward' or 'backward' strategy. Raises ------ ValueError If both `value` and `strategy` are given, if neither is given, or if `limit` is given while `strategy` is not 'forward' or 'backward'. See Also -------- fill_nan Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None], ... "b": [4, None, 6], ... } ... 
) >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) shape: (3, 2) ┌──────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪═════╡ │ 1 ┆ 4 │ │ 2 ┆ 0 │ │ null ┆ 6 │ └──────┴─────┘ >>> df.with_columns(pl.col("b").fill_null(99)) shape: (3, 2) ┌──────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪═════╡ │ 1 ┆ 4 │ │ 2 ┆ 99 │ │ null ┆ 6 │ └──────┴─────┘ >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) shape: (3, 2) ┌──────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪═════╡ │ 1 ┆ 4 │ │ 2 ┆ 4 │ │ null ┆ 6 │ └──────┴─────┘ >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) shape: (3, 2) ┌──────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ f64 │ ╞══════╪═════╡ │ 1 ┆ 4.0 │ │ 2 ┆ 5.0 │ │ null ┆ 6.0 │ └──────┴─────┘ >>> df.with_columns(pl.all().fill_null(pl.all().median())) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ f64 ┆ f64 │ ╞═════╪═════╡ │ 1.0 ┆ 4.0 │ │ 2.0 ┆ 5.0 │ │ 1.5 ┆ 6.0 │ └─────┴─────┘ """ if value is not None and strategy is not None: msg = "cannot specify both `value` and `strategy`" raise ValueError(msg) elif value is None and strategy is None: msg = "must specify either a fill `value` or `strategy`" raise ValueError(msg) elif strategy not in ("forward", "backward") and limit is not None: msg = "can only specify `limit` when strategy is set to 'backward' or 'forward'" raise ValueError(msg) if value is not None: value = parse_into_expression(value, str_as_lit=True) return self._from_pyexpr(self._pyexpr.fill_null(value)) else: return self._from_pyexpr( self._pyexpr.fill_null_with_strategy(strategy, limit) ) def fill_nan(self, value: int | float | Expr | None) -> Expr: """ Fill floating point NaN value with a fill value. Parameters ---------- value Value used to fill NaN values. Warnings -------- Note that floating point NaNs (Not a Number) are not missing values. To replace missing values, use :func:`fill_null`. See Also -------- fill_null Examples -------- >>> df = pl.DataFrame( ... { ... 
"a": [1.0, None, float("nan")], ... "b": [4.0, float("nan"), 6], ... } ... ) >>> df.with_columns(pl.col("b").fill_nan(0)) shape: (3, 2) ┌──────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ f64 ┆ f64 │ ╞══════╪═════╡ │ 1.0 ┆ 4.0 │ │ null ┆ 0.0 │ │ NaN ┆ 6.0 │ └──────┴─────┘ """ fill_value = parse_into_expression(value, str_as_lit=True) return self._from_pyexpr(self._pyexpr.fill_nan(fill_value)) def forward_fill(self, limit: int | None = None) -> Expr: """ Fill missing values with the last non-null value. Parameters ---------- limit The number of consecutive null values to forward fill. See Also -------- backward_fill shift Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None], ... "b": [4, None, 6], ... } ... ) >>> df.select(pl.all().forward_fill()) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 1 ┆ 4 │ │ 2 ┆ 4 │ │ 2 ┆ 6 │ └─────┴─────┘ """ return self._from_pyexpr(self._pyexpr.forward_fill(limit)) def backward_fill(self, limit: int | None = None) -> Expr: """ Fill missing values with the next non-null value. Parameters ---------- limit The number of consecutive null values to backward fill. See Also -------- forward_fill shift Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, None], ... "b": [4, None, 6], ... "c": [None, None, 2], ... } ... ) >>> df.select(pl.all().backward_fill()) shape: (3, 3) ┌──────┬─────┬─────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞══════╪═════╪═════╡ │ 1 ┆ 4 ┆ 2 │ │ 2 ┆ 6 ┆ 2 │ │ null ┆ 6 ┆ 2 │ └──────┴─────┴─────┘ >>> df.select(pl.all().backward_fill(limit=1)) shape: (3, 3) ┌──────┬─────┬──────┐ │ a ┆ b ┆ c │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞══════╪═════╪══════╡ │ 1 ┆ 4 ┆ null │ │ 2 ┆ 6 ┆ 2 │ │ null ┆ 6 ┆ 2 │ └──────┴─────┴──────┘ """ return self._from_pyexpr(self._pyexpr.backward_fill(limit)) def reverse(self) -> Expr: """ Reverse the selection. Examples -------- >>> df = pl.DataFrame( ... { ... "A": [1, 2, 3, 4, 5], ... 
"fruits": ["banana", "banana", "apple", "apple", "banana"], ... "B": [5, 4, 3, 2, 1], ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], ... } ... ) >>> df.select( ... [ ... pl.all(), ... pl.all().reverse().name.suffix("_reverse"), ... ] ... ) shape: (5, 8) ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ """ # noqa: W505 return self._from_pyexpr(self._pyexpr.reverse()) def std(self, ddof: int = 1) -> Expr: """ Get standard deviation. Parameters ---------- ddof “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. Examples -------- >>> df = pl.DataFrame({"a": [-1, 0, 1]}) >>> df.select(pl.col("a").std()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 1.0 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.std(ddof)) def var(self, ddof: int = 1) -> Expr: """ Get variance. Parameters ---------- ddof “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. Examples -------- >>> df = pl.DataFrame({"a": [-1, 0, 1]}) >>> df.select(pl.col("a").var()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 1.0 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.var(ddof)) def max(self) -> Expr: """ Get maximum value. 
Examples -------- >>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]}) >>> df.select(pl.col("a").max()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 1.0 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.max()) def min(self) -> Expr: """ Get minimum value. Examples -------- >>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]}) >>> df.select(pl.col("a").min()) shape: (1, 1) ┌──────┐ │ a │ │ --- │ │ f64 │ ╞══════╡ │ -1.0 │ └──────┘ """ return self._from_pyexpr(self._pyexpr.min()) def nan_max(self) -> Expr: """ Get maximum value, but propagate/poison encountered NaN values. This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, whereas polars defaults to ignoring them. Examples -------- >>> df = pl.DataFrame({"a": [0.0, float("nan")]}) >>> df.select(pl.col("a").nan_max()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ NaN │ └─────┘ """ return self._from_pyexpr(self._pyexpr.nan_max()) def nan_min(self) -> Expr: """ Get minimum value, but propagate/poison encountered NaN values. This differs from numpy's `nanmin` as numpy defaults to propagating NaN values, whereas polars defaults to ignoring them. Examples -------- >>> df = pl.DataFrame({"a": [0.0, float("nan")]}) >>> df.select(pl.col("a").nan_min()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ NaN │ └─────┘ """ return self._from_pyexpr(self._pyexpr.nan_min()) def sum(self) -> Expr: """ Get sum value. Notes ----- Dtypes in {Int8, UInt8, Int16, UInt16} are cast to Int64 before summing to prevent overflow issues. Examples -------- >>> df = pl.DataFrame({"a": [-1, 0, 1]}) >>> df.select(pl.col("a").sum()) shape: (1, 1) ┌─────┐ │ a │ │ --- │ │ i64 │ ╞═════╡ │ 0 │ └─────┘ """ return self._from_pyexpr(self._pyexpr.sum()) def mean(self) -> Expr: """ Get mean value. 
def median(self) -> Expr:
    """
    Compute the median, using linear interpolation.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [-1, 0, 1]})
    >>> df.select(pl.col("a").median())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 0.0 │
    └─────┘
    """
    py_median = self._pyexpr.median()
    return self._from_pyexpr(py_median)


def product(self) -> Expr:
    """
    Compute the product of the expression's values.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3]})
    >>> df.select(pl.col("a").product())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 6   │
    └─────┘
    """
    py_product = self._pyexpr.product()
    return self._from_pyexpr(py_product)


def n_unique(self) -> Expr:
    """
    Count the number of unique values.

    Notes
    -----
    `null` counts as a unique value for the purposes of this operation.

    Examples
    --------
    >>> df = pl.DataFrame({"x": [1, 1, 2, 2, 3], "y": [1, 1, 1, None, None]})
    >>> df.select(
    ...     x_unique=pl.col("x").n_unique(),
    ...     y_unique=pl.col("y").n_unique(),
    ... )
    shape: (1, 2)
    ┌──────────┬──────────┐
    │ x_unique ┆ y_unique │
    │ ---      ┆ ---      │
    │ u32      ┆ u32      │
    ╞══════════╪══════════╡
    │ 3        ┆ 2        │
    └──────────┴──────────┘
    """
    py_count = self._pyexpr.n_unique()
    return self._from_pyexpr(py_count)


def approx_n_unique(self) -> Expr:
    """
    Approximate the number of unique values.

    The estimate is computed with the HyperLogLog++ algorithm for
    cardinality estimation.

    Examples
    --------
    >>> df = pl.DataFrame({"n": [1, 1, 2]})
    >>> df.select(pl.col("n").approx_n_unique())
    shape: (1, 1)
    ┌─────┐
    │ n   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    └─────┘
    >>> df = pl.DataFrame({"n": range(1000)})
    >>> df.select(
    ...     exact=pl.col("n").n_unique(),
    ...     approx=pl.col("n").approx_n_unique(),
    ... )  # doctest: +SKIP
    shape: (1, 2)
    ┌───────┬────────┐
    │ exact ┆ approx │
    │ ---   ┆ ---    │
    │ u32   ┆ u32    │
    ╞═══════╪════════╡
    │ 1000  ┆ 1005   │
    └───────┴────────┘
    """
    py_estimate = self._pyexpr.approx_n_unique()
    return self._from_pyexpr(py_estimate)
def null_count(self) -> Expr:
    """
    Count the null values.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [None, 1, None],
    ...         "b": [10, None, 300],
    ...         "c": [350, 650, 850],
    ...     }
    ... )
    >>> df.select(pl.all().null_count())
    shape: (1, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ u32 ┆ u32 ┆ u32 │
    ╞═════╪═════╪═════╡
    │ 2   ┆ 1   ┆ 0   │
    └─────┴─────┴─────┘
    """
    py_nulls = self._pyexpr.null_count()
    return self._from_pyexpr(py_nulls)


def has_nulls(self) -> Expr:
    """
    Check whether the expression contains at least one null value.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [None, 1, None],
    ...         "b": [10, None, 300],
    ...         "c": [350, 650, 850],
    ...     }
    ... )
    >>> df.select(pl.all().has_nulls())
    shape: (1, 3)
    ┌──────┬──────┬───────┐
    │ a    ┆ b    ┆ c     │
    │ ---  ┆ ---  ┆ ---   │
    │ bool ┆ bool ┆ bool  │
    ╞══════╪══════╪═══════╡
    │ true ┆ true ┆ false │
    └──────┴──────┴───────┘
    """
    # Built on top of `null_count` rather than a dedicated backend call.
    n_nulls = self.null_count()
    return n_nulls > 0


def arg_unique(self) -> Expr:
    """
    Get the index of the first occurrence of each unique value.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [8, 9, 10],
    ...         "b": [None, 4, 4],
    ...     }
    ... )
    >>> df.select(pl.col("a").arg_unique())
    shape: (3, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 0   │
    │ 1   │
    │ 2   │
    └─────┘
    >>> df.select(pl.col("b").arg_unique())
    shape: (2, 1)
    ┌─────┐
    │ b   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 0   │
    │ 1   │
    └─────┘
    """
    py_indices = self._pyexpr.arg_unique()
    return self._from_pyexpr(py_indices)


def unique(self, *, maintain_order: bool = False) -> Expr:
    """
    Get the unique values of this expression.

    Parameters
    ----------
    maintain_order
        Maintain order of data. This requires more work.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 1, 2]})
    >>> df.select(pl.col("a").unique())  # doctest: +IGNORE_RESULT
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 2   │
    │ 1   │
    └─────┘
    >>> df.select(pl.col("a").unique(maintain_order=True))
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    └─────┘
    """
    backend = self._pyexpr
    # Pick the order-preserving variant only when it was asked for.
    deduplicate = backend.unique_stable if maintain_order else backend.unique
    return self._from_pyexpr(deduplicate())
def first(self) -> Expr:
    """
    Get the first value.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 1, 2]})
    >>> df.select(pl.col("a").first())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    └─────┘
    """
    return self._from_pyexpr(self._pyexpr.first())


def last(self) -> Expr:
    """
    Get the last value.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 3, 2]})
    >>> df.select(pl.col("a").last())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 2   │
    └─────┘
    """
    return self._from_pyexpr(self._pyexpr.last())


def over(
    self,
    partition_by: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    order_by: IntoExpr | Iterable[IntoExpr] | None = None,
    mapping_strategy: WindowMappingStrategy = "group_to_rows",
) -> Expr:
    """
    Compute expressions over the given groups.

    This expression is similar to performing a group by aggregation and joining the
    result back into the original DataFrame.

    The outcome is similar to how `window functions
    <https://www.postgresql.org/docs/current/tutorial-window.html>`_
    work in PostgreSQL.

    Parameters
    ----------
    partition_by
        Column(s) to group by. Accepts expression input. Strings are parsed as
        column names.
    *more_exprs
        Additional columns to group by, specified as positional arguments.
    order_by
        Order the window functions/aggregations with the partitioned groups by the
        result of the expression passed to `order_by`.
    mapping_strategy : {'group_to_rows', 'join', 'explode'}
        - group_to_rows
            If the aggregation results in multiple values, assign them back to their
            position in the DataFrame. This can only be done if the group yields
            the same elements before aggregation as after.
        - join
            Join the groups as 'List<group_dtype>' to the row positions.
            warning: this can be memory intensive.
        - explode
            Explodes the grouped data into new rows, similar to the results of
            `group_by` + `agg` + `explode`. Sorting of the given groups is required
            if the groups are not part of the window operation, otherwise the
            result would not make sense. This operation changes the number of rows.

    Examples
    --------
    Pass the name of a column to compute the expression over that column.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": ["a", "a", "b", "b", "b"],
    ...         "b": [1, 2, 3, 5, 3],
    ...         "c": [5, 4, 3, 2, 1],
    ...     }
    ... )
    >>> df.with_columns(c_max=pl.col("c").max().over("a"))
    shape: (5, 4)
    ┌─────┬─────┬─────┬───────┐
    │ a   ┆ b   ┆ c   ┆ c_max │
    │ --- ┆ --- ┆ --- ┆ ---   │
    │ str ┆ i64 ┆ i64 ┆ i64   │
    ╞═════╪═════╪═════╪═══════╡
    │ a   ┆ 1   ┆ 5   ┆ 5     │
    │ a   ┆ 2   ┆ 4   ┆ 5     │
    │ b   ┆ 3   ┆ 3   ┆ 3     │
    │ b   ┆ 5   ┆ 2   ┆ 3     │
    │ b   ┆ 3   ┆ 1   ┆ 3     │
    └─────┴─────┴─────┴───────┘

    Expression input is also supported.

    >>> df.with_columns(c_max=pl.col("c").max().over(pl.col("b") // 2))
    shape: (5, 4)
    ┌─────┬─────┬─────┬───────┐
    │ a   ┆ b   ┆ c   ┆ c_max │
    │ --- ┆ --- ┆ --- ┆ ---   │
    │ str ┆ i64 ┆ i64 ┆ i64   │
    ╞═════╪═════╪═════╪═══════╡
    │ a   ┆ 1   ┆ 5   ┆ 5     │
    │ a   ┆ 2   ┆ 4   ┆ 4     │
    │ b   ┆ 3   ┆ 3   ┆ 4     │
    │ b   ┆ 5   ┆ 2   ┆ 2     │
    │ b   ┆ 3   ┆ 1   ┆ 4     │
    └─────┴─────┴─────┴───────┘

    Group by multiple columns by passing multiple column names or expressions.

    >>> df.with_columns(c_min=pl.col("c").min().over("a", pl.col("b") % 2))
    shape: (5, 4)
    ┌─────┬─────┬─────┬───────┐
    │ a   ┆ b   ┆ c   ┆ c_min │
    │ --- ┆ --- ┆ --- ┆ ---   │
    │ str ┆ i64 ┆ i64 ┆ i64   │
    ╞═════╪═════╪═════╪═══════╡
    │ a   ┆ 1   ┆ 5   ┆ 5     │
    │ a   ┆ 2   ┆ 4   ┆ 4     │
    │ b   ┆ 3   ┆ 3   ┆ 1     │
    │ b   ┆ 5   ┆ 2   ┆ 1     │
    │ b   ┆ 3   ┆ 1   ┆ 1     │
    └─────┴─────┴─────┴───────┘

    You can use non-elementwise expressions with `over` too. By default they are
    evaluated using row-order, but you can specify a different one using `order_by`.

    >>> from datetime import date
    >>> df = pl.DataFrame(
    ...     {
    ...         "store_id": ["a", "a", "b", "b"],
    ...         "date": [
    ...             date(2024, 9, 18),
    ...             date(2024, 9, 17),
    ...             date(2024, 9, 18),
    ...             date(2024, 9, 16),
    ...         ],
    ...         "sales": [7, 9, 8, 10],
    ...     }
    ... )
    >>> df.with_columns(
    ...     cumulative_sales=pl.col("sales")
    ...     .cum_sum()
    ...     .over("store_id", order_by="date")
    ... )
    shape: (4, 4)
    ┌──────────┬────────────┬───────┬──────────────────┐
    │ store_id ┆ date       ┆ sales ┆ cumulative_sales │
    │ ---      ┆ ---        ┆ ---   ┆ ---              │
    │ str      ┆ date       ┆ i64   ┆ i64              │
    ╞══════════╪════════════╪═══════╪══════════════════╡
    │ a        ┆ 2024-09-18 ┆ 7     ┆ 16               │
    │ a        ┆ 2024-09-17 ┆ 9     ┆ 9                │
    │ b        ┆ 2024-09-18 ┆ 8     ┆ 18               │
    │ b        ┆ 2024-09-16 ┆ 10    ┆ 10               │
    └──────────┴────────────┴───────┴──────────────────┘

    If you don't require that the group order be preserved, then the more
    performant option is to use `mapping_strategy='explode'` - be careful however
    to only ever use this in a `select` statement, not a `with_columns` one.

    >>> window = {
    ...     "partition_by": "store_id",
    ...     "order_by": "date",
    ...     "mapping_strategy": "explode",
    ... }
    >>> df.select(
    ...     pl.all().over(**window),
    ...     cumulative_sales=pl.col("sales").cum_sum().over(**window),
    ... )
    shape: (4, 4)
    ┌──────────┬────────────┬───────┬──────────────────┐
    │ store_id ┆ date       ┆ sales ┆ cumulative_sales │
    │ ---      ┆ ---        ┆ ---   ┆ ---              │
    │ str      ┆ date       ┆ i64   ┆ i64              │
    ╞══════════╪════════════╪═══════╪══════════════════╡
    │ a        ┆ 2024-09-17 ┆ 9     ┆ 9                │
    │ a        ┆ 2024-09-18 ┆ 7     ┆ 16               │
    │ b        ┆ 2024-09-16 ┆ 10    ┆ 10               │
    │ b        ┆ 2024-09-18 ┆ 8     ┆ 18               │
    └──────────┴────────────┴───────┴──────────────────┘
    """
    partition_by = parse_into_list_of_expressions(partition_by, *more_exprs)
    if order_by is not None:
        order_by = parse_into_list_of_expressions(order_by)

    return self._from_pyexpr(
        self._pyexpr.over(
            partition_by,
            order_by=order_by,
            order_by_descending=False,  # does not work yet
            order_by_nulls_last=False,  # does not work yet
            mapping_strategy=mapping_strategy,
        )
    )
def rolling(
    self,
    index_column: str,
    *,
    period: str | timedelta,
    offset: str | timedelta | None = None,
    closed: ClosedInterval = "right",
) -> Expr:
    """
    Create rolling groups based on a temporal or integer column.

    Given a time series `<t_0, t_1, ..., t_n>`, the default windows are

    * (t_0 - period, t_0]
    * (t_1 - period, t_1]
    * ...
    * (t_n - period, t_n]

    whereas with a non-default `offset` the windows become

    * (t_0 + offset, t_0 + offset + period]
    * (t_1 + offset, t_1 + offset + period]
    * ...
    * (t_n + offset, t_n + offset + period]

    The `period` and `offset` arguments are created either from a timedelta, or
    by using the following string language:

    - 1ns   (1 nanosecond)
    - 1us   (1 microsecond)
    - 1ms   (1 millisecond)
    - 1s    (1 second)
    - 1m    (1 minute)
    - 1h    (1 hour)
    - 1d    (1 calendar day)
    - 1w    (1 calendar week)
    - 1mo   (1 calendar month)
    - 1q    (1 calendar quarter)
    - 1y    (1 calendar year)
    - 1i    (1 index count)

    Or combine them:
    "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

    By "calendar day", we mean the corresponding time on the next day (which may
    not be 24 hours, due to daylight savings). Similarly for "calendar week",
    "calendar month", "calendar quarter", and "calendar year".

    Parameters
    ----------
    index_column
        Column used to group based on the time window.
        Often of type Date/Datetime.
        This column must be sorted in ascending order.
        In case of a rolling group by on indices, dtype needs to be one of
        {UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily
        cast to Int64, so if performance matters use an Int64 column.
    period
        Length of the window - must be non-negative.
    offset
        Offset of the window. Default is `-period`.
    closed : {'right', 'left', 'both', 'none'}
        Define which sides of the temporal interval are closed (inclusive).

    Examples
    --------
    >>> dates = [
    ...     "2020-01-01 13:45:48",
    ...     "2020-01-01 16:42:13",
    ...     "2020-01-01 16:45:09",
    ...     "2020-01-02 18:12:48",
    ...     "2020-01-03 19:45:32",
    ...     "2020-01-08 23:16:43",
    ... ]
    >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
    ...     pl.col("dt").str.strptime(pl.Datetime).set_sorted()
    ... )
    >>> df.with_columns(
    ...     sum_a=pl.sum("a").rolling(index_column="dt", period="2d"),
    ...     min_a=pl.min("a").rolling(index_column="dt", period="2d"),
    ...     max_a=pl.max("a").rolling(index_column="dt", period="2d"),
    ... )
    shape: (6, 5)
    ┌─────────────────────┬─────┬───────┬───────┬───────┐
    │ dt                  ┆ a   ┆ sum_a ┆ min_a ┆ max_a │
    │ ---                 ┆ --- ┆ ---   ┆ ---   ┆ ---   │
    │ datetime[μs]        ┆ i64 ┆ i64   ┆ i64   ┆ i64   │
    ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡
    │ 2020-01-01 13:45:48 ┆ 3   ┆ 3     ┆ 3     ┆ 3     │
    │ 2020-01-01 16:42:13 ┆ 7   ┆ 10    ┆ 3     ┆ 7     │
    │ 2020-01-01 16:45:09 ┆ 5   ┆ 15    ┆ 3     ┆ 7     │
    │ 2020-01-02 18:12:48 ┆ 9   ┆ 24    ┆ 3     ┆ 9     │
    │ 2020-01-03 19:45:32 ┆ 2   ┆ 11    ┆ 2     ┆ 9     │
    │ 2020-01-08 23:16:43 ┆ 1   ┆ 1     ┆ 1     ┆ 1     │
    └─────────────────────┴─────┴───────┴───────┴───────┘
    """
    period_str = parse_as_duration_string(period)
    # With no explicit offset the window reaches back exactly one period.
    raw_offset = negate_duration_string(period_str) if offset is None else offset
    offset_str = parse_as_duration_string(raw_offset)

    rolled = self._pyexpr.rolling(index_column, period_str, offset_str, closed)
    return self._from_pyexpr(rolled)
def is_unique(self) -> Expr:
    """
    Get a boolean mask marking the values that occur exactly once.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 1, 2]})
    >>> df.select(pl.col("a").is_unique())
    shape: (3, 1)
    ┌───────┐
    │ a     │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ false │
    │ false │
    │ true  │
    └───────┘
    """
    py_mask = self._pyexpr.is_unique()
    return self._from_pyexpr(py_mask)
def is_first_distinct(self) -> Expr:
    """
    Return a boolean mask marking the first occurrence of each distinct value.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]})
    >>> df.with_columns(pl.col("a").is_first_distinct().alias("first"))
    shape: (5, 2)
    ┌─────┬───────┐
    │ a   ┆ first │
    │ --- ┆ ---   │
    │ i64 ┆ bool  │
    ╞═════╪═══════╡
    │ 1   ┆ true  │
    │ 1   ┆ false │
    │ 2   ┆ true  │
    │ 3   ┆ true  │
    │ 2   ┆ false │
    └─────┴───────┘
    """
    py_mask = self._pyexpr.is_first_distinct()
    return self._from_pyexpr(py_mask)


def is_last_distinct(self) -> Expr:
    """
    Return a boolean mask marking the last occurrence of each distinct value.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]})
    >>> df.with_columns(pl.col("a").is_last_distinct().alias("last"))
    shape: (5, 2)
    ┌─────┬───────┐
    │ a   ┆ last  │
    │ --- ┆ ---   │
    │ i64 ┆ bool  │
    ╞═════╪═══════╡
    │ 1   ┆ false │
    │ 1   ┆ true  │
    │ 2   ┆ false │
    │ 3   ┆ true  │
    │ 2   ┆ true  │
    └─────┴───────┘
    """
    py_mask = self._pyexpr.is_last_distinct()
    return self._from_pyexpr(py_mask)


def is_duplicated(self) -> Expr:
    """
    Return a boolean mask marking the duplicated values.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 1, 2]})
    >>> df.select(pl.col("a").is_duplicated())
    shape: (3, 1)
    ┌───────┐
    │ a     │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ true  │
    │ true  │
    │ false │
    └───────┘
    """
    py_mask = self._pyexpr.is_duplicated()
    return self._from_pyexpr(py_mask)


def peak_max(self) -> Expr:
    """
    Get a boolean mask of the local maximum peaks.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    >>> df.select(pl.col("a").peak_max())
    shape: (5, 1)
    ┌───────┐
    │ a     │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ false │
    │ false │
    │ false │
    │ false │
    │ true  │
    └───────┘
    """
    py_mask = self._pyexpr.peak_max()
    return self._from_pyexpr(py_mask)
def peak_min(self) -> Expr:
    """
    Get a boolean mask of the local minimum peaks.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]})
    >>> df.select(pl.col("a").peak_min())
    shape: (5, 1)
    ┌───────┐
    │ a     │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ false │
    │ true  │
    │ false │
    │ true  │
    │ false │
    └───────┘
    """
    py_mask = self._pyexpr.peak_min()
    return self._from_pyexpr(py_mask)


def quantile(
    self,
    quantile: float | Expr,
    interpolation: RollingInterpolationMethod = "nearest",
) -> Expr:
    """
    Compute a quantile of the expression.

    Parameters
    ----------
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
        Interpolation method.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]})
    >>> df.select(pl.col("a").quantile(0.3))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 2.0 │
    └─────┘
    >>> df.select(pl.col("a").quantile(0.3, interpolation="higher"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 2.0 │
    └─────┘
    >>> df.select(pl.col("a").quantile(0.3, interpolation="lower"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 1.0 │
    └─────┘
    >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 1.5 │
    └─────┘
    >>> df.select(pl.col("a").quantile(0.3, interpolation="linear"))
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 1.5 │
    └─────┘
    """
    # A plain float and an expression are both accepted for `quantile`.
    quantile_pyexpr = parse_into_expression(quantile)
    py_result = self._pyexpr.quantile(quantile_pyexpr, interpolation)
    return self._from_pyexpr(py_result)


@unstable()
def cut(
    self,
    breaks: Sequence[float],
    *,
    labels: Sequence[str] | None = None,
    left_closed: bool = False,
    include_breaks: bool = False,
) -> Expr:
    """
    Bin continuous values into discrete categories.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Parameters
    ----------
    breaks
        List of unique cut points.
    labels
        Names of the categories. The number of labels must be equal to the number
        of cut points plus one.
    left_closed
        Set the intervals to be left-closed instead of right-closed.
    include_breaks
        Include a column with the right endpoint of the bin each observation falls
        in. This will change the data type of the output from a
        :class:`Categorical` to a :class:`Struct`.

    Returns
    -------
    Expr
        Expression of data type :class:`Categorical` if `include_breaks` is set to
        `False` (default), otherwise an expression of data type :class:`Struct`.

    See Also
    --------
    qcut

    Examples
    --------
    Divide a column into three categories.

    >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]})
    >>> df.with_columns(
    ...     pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut")
    ... )
    shape: (5, 2)
    ┌─────┬─────┐
    │ foo ┆ cut │
    │ --- ┆ --- │
    │ i64 ┆ cat │
    ╞═════╪═════╡
    │ -2  ┆ a   │
    │ -1  ┆ a   │
    │ 0   ┆ b   │
    │ 1   ┆ b   │
    │ 2   ┆ c   │
    └─────┴─────┘

    Add both the category and the breakpoint.

    >>> df.with_columns(
    ...     pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut")
    ... ).unnest("cut")
    shape: (5, 3)
    ┌─────┬────────────┬────────────┐
    │ foo ┆ breakpoint ┆ category   │
    │ --- ┆ ---        ┆ ---        │
    │ i64 ┆ f64        ┆ cat        │
    ╞═════╪════════════╪════════════╡
    │ -2  ┆ -1.0       ┆ (-inf, -1] │
    │ -1  ┆ -1.0       ┆ (-inf, -1] │
    │ 0   ┆ 1.0        ┆ (-1, 1]    │
    │ 1   ┆ 1.0        ┆ (-1, 1]    │
    │ 2   ┆ inf        ┆ (1, inf]   │
    └─────┴────────────┴────────────┘
    """
    py_binned = self._pyexpr.cut(breaks, labels, left_closed, include_breaks)
    return self._from_pyexpr(py_binned)
@unstable()
def qcut(
    self,
    quantiles: Sequence[float] | int,
    *,
    labels: Sequence[str] | None = None,
    left_closed: bool = False,
    allow_duplicates: bool = False,
    include_breaks: bool = False,
) -> Expr:
    """
    Bin continuous values into discrete categories based on their quantiles.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Parameters
    ----------
    quantiles
        Either a list of quantile probabilities between 0 and 1 or a positive
        integer determining the number of bins with uniform probability.
    labels
        Names of the categories. The number of labels must be equal to the number
        of categories.
    left_closed
        Set the intervals to be left-closed instead of right-closed.
    allow_duplicates
        If set to `True`, duplicates in the resulting quantiles are dropped,
        rather than raising a `DuplicateError`. This can happen even with unique
        probabilities, depending on the data.
    include_breaks
        Include a column with the right endpoint of the bin each observation falls
        in. This will change the data type of the output from a
        :class:`Categorical` to a :class:`Struct`.

    Returns
    -------
    Expr
        Expression of data type :class:`Categorical` if `include_breaks` is set to
        `False` (default), otherwise an expression of data type :class:`Struct`.

    See Also
    --------
    cut

    Examples
    --------
    Divide a column into three categories according to pre-defined quantile
    probabilities.

    >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]})
    >>> df.with_columns(
    ...     pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut")
    ... )
    shape: (5, 2)
    ┌─────┬──────┐
    │ foo ┆ qcut │
    │ --- ┆ ---  │
    │ i64 ┆ cat  │
    ╞═════╪══════╡
    │ -2  ┆ a    │
    │ -1  ┆ a    │
    │ 0   ┆ b    │
    │ 1   ┆ b    │
    │ 2   ┆ c    │
    └─────┴──────┘

    Divide a column into two categories using uniform quantile probabilities.

    >>> df.with_columns(
    ...     pl.col("foo")
    ...     .qcut(2, labels=["low", "high"], left_closed=True)
    ...     .alias("qcut")
    ... )
    shape: (5, 2)
    ┌─────┬──────┐
    │ foo ┆ qcut │
    │ --- ┆ ---  │
    │ i64 ┆ cat  │
    ╞═════╪══════╡
    │ -2  ┆ low  │
    │ -1  ┆ low  │
    │ 0   ┆ high │
    │ 1   ┆ high │
    │ 2   ┆ high │
    └─────┴──────┘

    Add both the category and the breakpoint.

    >>> df.with_columns(
    ...     pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut")
    ... ).unnest("qcut")
    shape: (5, 3)
    ┌─────┬────────────┬────────────┐
    │ foo ┆ breakpoint ┆ category   │
    │ --- ┆ ---        ┆ ---        │
    │ i64 ┆ f64        ┆ cat        │
    ╞═════╪════════════╪════════════╡
    │ -2  ┆ -1.0       ┆ (-inf, -1] │
    │ -1  ┆ -1.0       ┆ (-inf, -1] │
    │ 0   ┆ 1.0        ┆ (-1, 1]    │
    │ 1   ┆ 1.0        ┆ (-1, 1]    │
    │ 2   ┆ inf        ┆ (1, inf]   │
    └─────┴────────────┴────────────┘
    """
    backend = self._pyexpr
    # An integer asks for that many uniform bins; anything else is passed on
    # as explicit quantile probabilities.
    bin_values = backend.qcut_uniform if isinstance(quantiles, int) else backend.qcut
    py_binned = bin_values(
        quantiles, labels, left_closed, allow_duplicates, include_breaks
    )
    return self._from_pyexpr(py_binned)
).unnest("qcut") shape: (5, 3) ┌─────┬────────────┬────────────┐ │ foo ┆ breakpoint ┆ category │ │ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ cat │ ╞═════╪════════════╪════════════╡ │ -2 ┆ -1.0 ┆ (-inf, -1] │ │ -1 ┆ -1.0 ┆ (-inf, -1] │ │ 0 ┆ 1.0 ┆ (-1, 1] │ │ 1 ┆ 1.0 ┆ (-1, 1] │ │ 2 ┆ inf ┆ (1, inf] │ └─────┴────────────┴────────────┘ """ if isinstance(quantiles, int): pyexpr = self._pyexpr.qcut_uniform( quantiles, labels, left_closed, allow_duplicates, include_breaks ) else: pyexpr = self._pyexpr.qcut( quantiles, labels, left_closed, allow_duplicates, include_breaks ) return self._from_pyexpr(pyexpr) def rle(self) -> Expr: """ Compress the column data using run-length encoding. Run-length encoding (RLE) encodes data by storing each *run* of identical values as a single value and its length. Returns ------- Expr Expression of data type `Struct` with fields `len` of data type `UInt32` and `value` of the original data type. See Also -------- rle_id Examples -------- >>> df = pl.DataFrame({"a": [1, 1, 2, 1, None, 1, 3, 3]}) >>> df.select(pl.col("a").rle()).unnest("a") shape: (6, 2) ┌─────┬───────┐ │ len ┆ value │ │ --- ┆ --- │ │ u32 ┆ i64 │ ╞═════╪═══════╡ │ 2 ┆ 1 │ │ 1 ┆ 2 │ │ 1 ┆ 1 │ │ 1 ┆ null │ │ 1 ┆ 1 │ │ 2 ┆ 3 │ └─────┴───────┘ """ return self._from_pyexpr(self._pyexpr.rle()) def rle_id(self) -> Expr: """ Get a distinct integer ID for each run of identical values. The ID starts at 0 and increases by one each time the value of the column changes. Returns ------- Expr Expression of data type `UInt32`. See Also -------- rle Notes ----- This functionality is especially useful for defining a new group for every time a column's value changes, rather than for every distinct value of that column. Examples -------- >>> df = pl.DataFrame( ... { ... "a": [1, 2, 1, 1, 1], ... "b": ["x", "x", None, "y", "y"], ... } ... ) >>> df.with_columns( ... rle_id_a=pl.col("a").rle_id(), ... rle_id_ab=pl.struct("a", "b").rle_id(), ... 
def filter(
    self,
    *predicates: IntoExprColumn | Iterable[IntoExprColumn],
    **constraints: Any,
) -> Expr:
    """
    Filter the expression based on one or more predicate expressions.

    The original order of the remaining elements is preserved.

    Elements where the filter does not evaluate to True are discarded, including
    nulls.

    Mostly useful in an aggregation context. If you want to filter on a DataFrame
    level, use `LazyFrame.filter`.

    Parameters
    ----------
    predicates
        Expression(s) that evaluates to a boolean Series.
    constraints
        Column filters; use `name = value` to filter columns by the supplied value.
        Each constraint will behave the same as `pl.col(name).eq(value)`, and
        will be implicitly joined with the other filter conditions using `&`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "group_col": ["g1", "g1", "g2"],
    ...         "b": [1, 2, 3],
    ...     }
    ... )
    >>> df.group_by("group_col").agg(
    ...     lt=pl.col("b").filter(pl.col("b") < 2).sum(),
    ...     gte=pl.col("b").filter(pl.col("b") >= 2).sum(),
    ... ).sort("group_col")
    shape: (2, 3)
    ┌───────────┬─────┬─────┐
    │ group_col ┆ lt  ┆ gte │
    │ ---       ┆ --- ┆ --- │
    │ str       ┆ i64 ┆ i64 │
    ╞═══════════╪═════╪═════╡
    │ g1        ┆ 1   ┆ 2   │
    │ g2        ┆ 0   ┆ 3   │
    └───────────┴─────┴─────┘

    Filter expressions can also take constraints as keyword arguments.

    >>> df = pl.DataFrame(
    ...     {
    ...         "key": ["a", "a", "a", "a", "b", "b", "b", "b", "b"],
    ...         "n": [1, 2, 2, 3, 1, 3, 3, 2, 3],
    ...     },
    ... )
    >>> df.group_by("key").agg(
    ...     n_1=pl.col("n").filter(n=1).sum(),
    ...     n_2=pl.col("n").filter(n=2).sum(),
    ...     n_3=pl.col("n").filter(n=3).sum(),
    ... ).sort(by="key")
    shape: (2, 4)
    ┌─────┬─────┬─────┬─────┐
    │ key ┆ n_1 ┆ n_2 ┆ n_3 │
    │ --- ┆ --- ┆ --- ┆ --- │
    │ str ┆ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╪═════╡
    │ a   ┆ 1   ┆ 4   ┆ 3   │
    │ b   ┆ 1   ┆ 2   ┆ 9   │
    └─────┴─────┴─────┴─────┘
    """
    # Positional predicates and keyword constraints collapse into one predicate.
    combined = parse_predicates_constraints_into_expression(
        *predicates, **constraints
    )
    py_filtered = self._pyexpr.filter(combined)
    return self._from_pyexpr(py_filtered)
).sort(by="key") shape: (2, 4) ┌─────┬─────┬─────┬─────┐ │ key ┆ n_1 ┆ n_2 ┆ n_3 │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 ┆ i64 │ ╞═════╪═════╪═════╪═════╡ │ a ┆ 1 ┆ 4 ┆ 3 │ │ b ┆ 1 ┆ 2 ┆ 9 │ └─────┴─────┴─────┴─────┘ """ predicate = parse_predicates_constraints_into_expression( *predicates, **constraints ) return self._from_pyexpr(self._pyexpr.filter(predicate)) @deprecate_function("Use `filter` instead.", version="0.20.4") def where(self, predicate: Expr) -> Expr: """ Filter a single column. .. deprecated:: 0.20.4 Use :func:`filter` instead. Alias for :func:`filter`. Parameters ---------- predicate Boolean expression. Examples -------- >>> df = pl.DataFrame( ... { ... "group_col": ["g1", "g1", "g2"], ... "b": [1, 2, 3], ... } ... ) >>> df.group_by("group_col").agg( # doctest: +SKIP ... [ ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), ... ] ... ).sort("group_col") shape: (2, 3) ┌───────────┬─────┬─────┐ │ group_col ┆ lt ┆ gte │ │ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 │ ╞═══════════╪═════╪═════╡ │ g1 ┆ 1 ┆ 2 │ │ g2 ┆ 0 ┆ 3 │ └───────────┴─────┴─────┘ """ return self.filter(predicate) class _map_batches_wrapper: def __init__( self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None, ) -> None: self.function = function self.return_dtype = return_dtype def __call__(self, *args: Any, **kwargs: Any) -> Any: result = self.function(*args, **kwargs) if _check_for_numpy(result) and isinstance(result, np.ndarray): result = pl.Series(result, dtype=self.return_dtype) return result def map_batches( self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = None, *, agg_list: bool = False, is_elementwise: bool = False, returns_scalar: bool = False, ) -> Expr: """ Apply a custom python function to a whole Series or sequence of Series. 
The output of this custom function is presumed to be either a Series, or a NumPy array (in which case it will be automatically converted into a Series), or a scalar that will be converted into a Series. If the result is a scalar and you want it to stay as a scalar, pass in ``returns_scalar=True``. If you want to apply a custom function elementwise over single values, see :func:`map_elements`. A reasonable use case for `map` functions is transforming the values represented by an expression using a third-party library. Parameters ---------- function Lambda/function to apply. return_dtype Dtype of the output Series. If not set, the dtype will be inferred based on the first non-null value that is returned by the function. agg_list Aggregate the values of the expression into a list before applying the function. This parameter only works in a group-by context. The function will be invoked only once on a list of groups, rather than once per group. is_elementwise If set to true this can run in the streaming engine, but may yield incorrect results in group-by. Ensure you know what you are doing! returns_scalar If the function returns a scalar, by default it will be wrapped in a list in the output, since the assumption is that the function always returns something Series-like. If you want to keep the result as a scalar, set this argument to True. Warnings -------- If `return_dtype` is not provided, this may lead to unexpected results. We allow this, but it is considered a bug in the user's query. See Also -------- map_elements replace Examples -------- >>> df = pl.DataFrame( ... { ... "sine": [0.0, 1.0, 0.0, -1.0], ... "cosine": [1.0, 0.0, -1.0, 0.0], ... } ... ) >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) shape: (1, 2) ┌──────┬────────┐ │ sine ┆ cosine │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪════════╡ │ 1 ┆ 0 │ └──────┴────────┘ In a group-by context, the `agg_list` parameter can improve performance if used correctly. 
The following example has `agg_list` set to `False`, which causes the function to be applied once per group. The input of the function is a Series of type `Int64`. This is less efficient. >>> df = pl.DataFrame( ... { ... "a": [0, 1, 0, 1], ... "b": [1, 2, 3, 4], ... } ... ) >>> df.group_by("a").agg( ... pl.col("b").map_batches(lambda x: x + 2, agg_list=False) ... ) # doctest: +IGNORE_RESULT shape: (2, 2) ┌─────┬───────────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ list[i64] │ ╞═════╪═══════════╡ │ 1 ┆ [4, 6] │ │ 0 ┆ [3, 5] │ └─────┴───────────┘ Using `agg_list=True` would be more efficient. In this example, the input of the function is a Series of type `List(Int64)`. >>> df.group_by("a").agg( ... pl.col("b").map_batches( ... lambda x: x.list.eval(pl.element() + 2), agg_list=True ... ) ... ) # doctest: +IGNORE_RESULT shape: (2, 2) ┌─────┬───────────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ list[i64] │ ╞═════╪═══════════╡ │ 0 ┆ [3, 5] │ │ 1 ┆ [4, 6] │ └─────┴───────────┘ Here's an example of a function that returns a scalar, where we want it to stay as a scalar: >>> df = pl.DataFrame( ... { ... "a": [0, 1, 0, 1], ... "b": [1, 2, 3, 4], ... } ... ) >>> df.group_by("a").agg( ... pl.col("b").map_batches(lambda x: x.max(), returns_scalar=True) ... ) # doctest: +IGNORE_RESULT shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 1 ┆ 4 │ │ 0 ┆ 3 │ └─────┴─────┘ Call a function that takes multiple arguments by creating a `struct` and referencing its fields inside the function call. >>> df = pl.DataFrame( ... { ... "a": [5, 1, 0, 3], ... "b": [4, 2, 3, 4], ... } ... ) >>> df.with_columns( ... a_times_b=pl.struct("a", "b").map_batches( ... lambda x: np.multiply(x.struct.field("a"), x.struct.field("b")) ... ) ... 
def map_elements(
    self,
    function: Callable[[Any], Any],
    return_dtype: PolarsDataType | None = None,
    *,
    skip_nulls: bool = True,
    pass_name: bool = False,
    strategy: MapElementsStrategy = "thread_local",
    returns_scalar: bool = False,
) -> Expr:
    """
    Map a custom/user-defined function (UDF) to each element of a column.

    .. warning::
        This method is much slower than the native expressions API.
        Only use it if you cannot implement your logic otherwise.

        Suppose that the function is: `x ↦ sqrt(x)`:

        - For mapping elements of a series, consider: `pl.col("col_name").sqrt()`.
        - For mapping inner elements of lists, consider:
          `pl.col("col_name").list.eval(pl.element().sqrt())`.
        - For mapping elements of struct fields, consider:
          `pl.col("col_name").struct.field("field_name").sqrt()`.

        If you want to replace the original column or field,
        consider :meth:`.with_columns <polars.DataFrame.with_columns>`
        and :meth:`.with_fields <polars.Expr.struct.with_fields>`.

    The UDF is applied to each element of a column. Note that, in a GroupBy
    context, the column will have been pre-aggregated and so each element
    will itself be a Series. Therefore, depending on the context,
    requirements for `function` differ:

    * Selection
        Expects `function` to be of type `Callable[[Any], Any]`.
        Applies a Python function to each individual value in the column.
    * GroupBy
        Expects `function` to be of type `Callable[[Series], Any]`.
        For each group, applies a Python function to the slice of the column
        corresponding to that group.

    Parameters
    ----------
    function
        Lambda/function to map.
    return_dtype
        Dtype of the output Series.
        If not set, the dtype will be inferred based on the first non-null value
        that is returned by the function.
    skip_nulls
        Don't map the function over values that contain nulls (this is faster).
    pass_name
        Pass the Series name to the custom function (this is more expensive).
    returns_scalar
        If the function passed does a reduction
        (e.g. sum, min, etc), Polars must be informed of this otherwise
        the schema might be incorrect.
    strategy : {'thread_local', 'threading'}
        The threading strategy to use.

        - 'thread_local': run the python function on a single thread.
        - 'threading': run the python function on separate threads. Use with
          care as this can slow performance. This might only speed up
          your code if the amount of work per element is significant
          and the python function releases the GIL (e.g. via calling
          a c function)

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.

    Returns
    -------
    Expr
        The expression produced by `map_batches` around the wrapped function.

    Raises
    ------
    ValueError
        If `strategy` is neither `'thread_local'` nor `'threading'`.

    Warnings
    --------
    If `return_dtype` is not provided, this may lead to unexpected results.
    We allow this, but it is considered a bug in the user's query.

    Notes
    -----
    * Using `map_elements` is strongly discouraged as you will be effectively
      running python "for" loops, which will be very slow. Wherever possible you
      should prefer the native expression API to achieve the best performance.

    * If your function is expensive and you don't want it to be called more than
      once for a given input, consider applying an `@lru_cache` decorator to it.
      If your data is suitable you may achieve *significant* speedups.

    * Window function application using `over` is considered a GroupBy context
      here, so `map_elements` can be used to map functions over window groups.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3, 1],
    ...         "b": ["a", "b", "c", "c"],
    ...     }
    ... )

    The function is applied to each element of column `'a'`:

    >>> df.with_columns(  # doctest: +SKIP
    ...     pl.col("a")
    ...     .map_elements(lambda x: x * 2, return_dtype=pl.Int64)
    ...     .alias("a_times_2"),
    ... )
    shape: (4, 3)
    ┌─────┬─────┬───────────┐
    │ a   ┆ b   ┆ a_times_2 │
    │ --- ┆ --- ┆ ---       │
    │ i64 ┆ str ┆ i64       │
    ╞═════╪═════╪═══════════╡
    │ 1   ┆ a   ┆ 2         │
    │ 2   ┆ b   ┆ 4         │
    │ 3   ┆ c   ┆ 6         │
    │ 1   ┆ c   ┆ 2         │
    └─────┴─────┴───────────┘

    Tip: it is better to implement this with an expression:

    >>> df.with_columns(
    ...     (pl.col("a") * 2).alias("a_times_2"),
    ... )  # doctest: +IGNORE_RESULT

    In a GroupBy context, each element of the column is itself a Series:

    >>> (
    ...     df.lazy().group_by("b").agg(pl.col("a")).collect()
    ... )  # doctest: +IGNORE_RESULT
    shape: (3, 2)
    ┌─────┬───────────┐
    │ b   ┆ a         │
    │ --- ┆ ---       │
    │ str ┆ list[i64] │
    ╞═════╪═══════════╡
    │ a   ┆ [1]       │
    │ b   ┆ [2]       │
    │ c   ┆ [3, 1]    │
    └─────┴───────────┘

    Therefore, from the user's point-of-view, the function is applied per-group:

    >>> (
    ...     df.lazy()
    ...     .group_by("b")
    ...     .agg(pl.col("a").map_elements(lambda x: x.sum(), return_dtype=pl.Int64))
    ...     .collect()
    ... )  # doctest: +IGNORE_RESULT
    shape: (3, 2)
    ┌─────┬─────┐
    │ b   ┆ a   │
    │ --- ┆ --- │
    │ str ┆ i64 │
    ╞═════╪═════╡
    │ a   ┆ 1   │
    │ b   ┆ 2   │
    │ c   ┆ 4   │
    └─────┴─────┘

    Tip: again, it is better to implement this with an expression:

    >>> (
    ...     df.lazy()
    ...     .group_by("b", maintain_order=True)
    ...     .agg(pl.col("a").sum())
    ...     .collect()
    ... )  # doctest: +IGNORE_RESULT

    Window function application using `over` will behave as a GroupBy
    context, with your function receiving individual window groups:

    >>> df = pl.DataFrame(
    ...     {
    ...         "key": ["x", "x", "y", "x", "y", "z"],
    ...         "val": [1, 1, 1, 1, 1, 1],
    ...     }
    ... )
    >>> df.with_columns(
    ...     scaled=pl.col("val")
    ...     .map_elements(lambda s: s * len(s), return_dtype=pl.List(pl.Int64))
    ...     .over("key"),
    ... ).sort("key")
    shape: (6, 3)
    ┌─────┬─────┬────────┐
    │ key ┆ val ┆ scaled │
    │ --- ┆ --- ┆ ---    │
    │ str ┆ i64 ┆ i64    │
    ╞═════╪═════╪════════╡
    │ x   ┆ 1   ┆ 3      │
    │ x   ┆ 1   ┆ 3      │
    │ x   ┆ 1   ┆ 3      │
    │ y   ┆ 1   ┆ 2      │
    │ y   ┆ 1   ┆ 2      │
    │ z   ┆ 1   ┆ 1      │
    └─────┴─────┴────────┘

    Note that this function would *also* be better-implemented natively:

    >>> df.with_columns(
    ...     scaled=(pl.col("val") * pl.col("val").count()).over("key"),
    ... ).sort("key")  # doctest: +IGNORE_RESULT
    """
    # The "threading" strategy is flagged as unstable before any work is done.
    if strategy == "threading":
        issue_unstable_warning(
            "The 'threading' strategy for `map_elements` is considered unstable."
        )

    # input x: Series of type list containing the group values
    # Imported at call time rather than at module level.
    from polars._utils.udfs import warn_on_inefficient_map

    # Only expressions with at least one root column are checked for a
    # possibly inefficient UDF.
    root_names = self.meta.root_names()
    if len(root_names) > 0:
        warn_on_inefficient_map(function, columns=root_names, map_target="expr")

    # `wrap_f` adapts the per-element UDF to the per-Series callable that
    # `map_batches` expects, by delegating to `Series.map_elements`.
    if pass_name:

        def wrap_f(x: Series) -> Series:  # pragma: no cover
            def inner(s: Series | Any) -> Series:  # pragma: no cover
                # Only Series elements can carry a name; they are re-labelled
                # with the name of the Series being mapped.
                if isinstance(s, pl.Series):
                    s = s.alias(x.name)
                return function(s)

            # The inefficiency warning was already handled above, so it is
            # silenced for the inner Series-level call.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PolarsInefficientMapWarning)
                return x.map_elements(
                    inner, return_dtype=return_dtype, skip_nulls=skip_nulls
                )

    else:

        def wrap_f(x: Series) -> Series:  # pragma: no cover
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PolarsInefficientMapWarning)
                return x.map_elements(
                    function, return_dtype=return_dtype, skip_nulls=skip_nulls
                )

    if strategy == "thread_local":
        return self.map_batches(
            wrap_f,
            agg_list=True,
            return_dtype=return_dtype,
            returns_scalar=returns_scalar,
        )
    elif strategy == "threading":

        def wrap_threading(x: Series) -> Series:
            def get_lazy_promise(df: DataFrame) -> LazyFrame:
                # Build (but do not run) a query applying `wrap_f` to the
                # single column "x" of the given frame.
                return df.lazy().select(
                    F.col("x").map_batches(
                        wrap_f,
                        agg_list=True,
                        return_dtype=return_dtype,
                        returns_scalar=returns_scalar,
                    )
                )

            # NOTE(review): the column is renamed to "x" here, so with
            # `pass_name=True` the name seen by `wrap_f` under this strategy is
            # presumably "x" rather than the original Series name — confirm.
            df = x.to_frame("x")

            # Nothing to partition: evaluate the query once on the empty frame.
            if x.len() == 0:
                return get_lazy_promise(df).collect().to_series()

            n_threads = thread_pool_size()
            chunk_size = x.len() // n_threads
            remainder = x.len() % n_threads
            if chunk_size == 0:
                # Fewer rows than threads: one single-row partition per row
                # (here `remainder` equals the number of rows).
                chunk_sizes = [1 for _ in range(remainder)]
            else:
                # The first `remainder` partitions take one extra row so that
                # the sizes sum to the number of rows.
                chunk_sizes = [
                    chunk_size + 1 if i < remainder else chunk_size
                    for i in range(n_threads)
                ]

            # create partitions with LazyFrames
            # these are promises on a computation
            partitions = []
            b = 0
            for step in chunk_sizes:
                # [a, b) is the row range of the current partition.
                a = b
                b = b + step
                partition_df = df[a:b, :]
                partitions.append(get_lazy_promise(partition_df))

            # Evaluate all partitions and concatenate the results in
            # partition order, without rechunking.
            out = [df.to_series() for df in F.collect_all(partitions)]
            return F.concat(out, rechunk=False)

        return self.map_batches(
            wrap_threading,
            agg_list=True,
            return_dtype=return_dtype,
            returns_scalar=returns_scalar,
        )
    else:
        msg = f"strategy {strategy!r} is not supported"
        raise ValueError(msg)
def tail(self, n: int | Expr = 10) -> Expr:
    """
    Return the final `n` rows of this expression.

    Parameters
    ----------
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]})
    >>> df.select(pl.col("foo").tail(3))
    shape: (3, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 5   │
    │ 6   │
    │ 7   │
    └─────┘
    """
    # Negating an unsigned-integer expression raises InvalidOperationError,
    # so the count is cast to a signed Int64 before it is negated.
    count_pyexpr = parse_into_expression(n)
    signed_count = count_pyexpr.cast(Int64, strict=False, wrap_numerical=True)
    start = -self._from_pyexpr(signed_count)
    # Slice `n` rows beginning `n` rows before the end.
    return self.slice(start, n)
def or_(self, *others: Any) -> Expr:
    """
    Combine this expression with others using `|` (`expr | other | ...`).

    Parameters
    ----------
    *others
        One or more integer or boolean expressions to evaluate/combine.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "x": [5, 6, 7, 4, 8],
    ...         "y": [1.5, 2.5, 1.0, 4.0, -5.75],
    ...         "z": [-9, 2, -1, 4, 8],
    ...     }
    ... )
    >>> df.select(
    ...     (pl.col("x") == pl.col("y"))
    ...     .or_(
    ...         pl.col("x") == pl.col("y"),
    ...         pl.col("y") == pl.col("z"),
    ...         pl.col("y").cast(int) == pl.col("z"),
    ...     )
    ...     .alias("any")
    ... )
    shape: (5, 1)
    ┌───────┐
    │ any   │
    │ ---   │
    │ bool  │
    ╞═══════╡
    │ false │
    │ true  │
    │ false │
    │ true  │
    │ false │
    └───────┘
    """
    # Fold left-to-right, starting from this expression.
    operands = (self, *others)
    return reduce(operator.or_, operands)
def eq_missing(self, other: Any) -> Expr:
    """
    Null-aware equality: like `expr == other`, but `None == None` holds.

    Unlike the default `eq`, null values are compared rather than propagated.

    Parameters
    ----------
    other
        A literal or expression value to compare with.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "x": [1.0, 2.0, float("nan"), 4.0, None, None],
    ...         "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None],
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.col("x").eq(pl.col("y")).alias("x eq y"),
    ...     pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"),
    ... )
    shape: (6, 4)
    ┌──────┬──────┬────────┬────────────────┐
    │ x    ┆ y    ┆ x eq y ┆ x eq_missing y │
    │ ---  ┆ ---  ┆ ---    ┆ ---            │
    │ f64  ┆ f64  ┆ bool   ┆ bool           │
    ╞══════╪══════╪════════╪════════════════╡
    │ 1.0  ┆ 2.0  ┆ false  ┆ false          │
    │ 2.0  ┆ 2.0  ┆ true   ┆ true           │
    │ NaN  ┆ NaN  ┆ true   ┆ true           │
    │ 4.0  ┆ 4.0  ┆ true   ┆ true           │
    │ null ┆ 5.0  ┆ null   ┆ false          │
    │ null ┆ null ┆ null   ┆ true           │
    └──────┴──────┴────────┴────────────────┘
    """
    # Plain strings are parsed as literals here (`str_as_lit=True`).
    rhs = parse_into_expression(other, str_as_lit=True)
    compared = self._pyexpr.eq_missing(rhs)
    return self._from_pyexpr(compared)
def gt(self, other: Any) -> Expr:
    """
    Named-method form of the "greater than" comparison `expr > other`.

    Parameters
    ----------
    other
        A literal or expression value to compare with.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "x": [5.0, 4.0, float("nan"), 2.0],
    ...         "y": [5.0, 3.0, float("nan"), 1.0],
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.col("x").gt(pl.col("y")).alias("x > y"),
    ... )
    shape: (4, 3)
    ┌─────┬─────┬───────┐
    │ x   ┆ y   ┆ x > y │
    │ --- ┆ --- ┆ ---   │
    │ f64 ┆ f64 ┆ bool  │
    ╞═════╪═════╪═══════╡
    │ 5.0 ┆ 5.0 ┆ false │
    │ 4.0 ┆ 3.0 ┆ true  │
    │ NaN ┆ NaN ┆ false │
    │ 2.0 ┆ 1.0 ┆ true  │
    └─────┴─────┴───────┘
    """
    # Delegates to the operator dunder so both spellings share one code path.
    is_greater = self.__gt__(other)
    return is_greater
def ne_missing(self, other: Any) -> Expr:
    """
    Method equivalent of inequality operator `expr != other` where `None == None`.

    Unlike the default `ne`, null values are compared rather than propagated.

    Parameters
    ----------
    other
        A literal or expression value to compare with.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "x": [1.0, 2.0, float("nan"), 4.0, None, None],
    ...         "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None],
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.col("x").ne(pl.col("y")).alias("x ne y"),
    ...     pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"),
    ... )
    shape: (6, 4)
    ┌──────┬──────┬────────┬────────────────┐
    │ x    ┆ y    ┆ x ne y ┆ x ne_missing y │
    │ ---  ┆ ---  ┆ ---    ┆ ---            │
    │ f64  ┆ f64  ┆ bool   ┆ bool           │
    ╞══════╪══════╪════════╪════════════════╡
    │ 1.0  ┆ 2.0  ┆ true   ┆ true           │
    │ 2.0  ┆ 2.0  ┆ false  ┆ false          │
    │ NaN  ┆ NaN  ┆ false  ┆ false          │
    │ 4.0  ┆ 4.0  ┆ false  ┆ false          │
    │ null ┆ 5.0  ┆ null   ┆ true           │
    │ null ┆ null ┆ null   ┆ false          │
    └──────┴──────┴────────┴────────────────┘
    """
    # Plain strings are parsed as literals here (`str_as_lit=True`).
    rhs = parse_into_expression(other, str_as_lit=True)
    compared = self._pyexpr.neq_missing(rhs)
    return self._from_pyexpr(compared)
def mod(self, other: Any) -> Expr:
    """
    Named-method form of the modulus operator `expr % other`.

    Parameters
    ----------
    other
        Numeric literal or expression value.

    Examples
    --------
    >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]})
    >>> df.with_columns(pl.col("x").mod(2).alias("x%2"))
    shape: (5, 2)
    ┌─────┬─────┐
    │ x   ┆ x%2 │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 0   ┆ 0   │
    │ 1   ┆ 1   │
    │ 2   ┆ 0   │
    │ 3   ┆ 1   │
    │ 4   ┆ 0   │
    └─────┴─────┘
    """
    # Delegates to the operator dunder so both spellings share one code path.
    remainder = self.__mod__(other)
    return remainder
def pow(self, exponent: IntoExprColumn | int | float) -> Expr:
    """
    Named-method form of the exponentiation operator `expr ** exponent`.

    If the exponent is float, the result follows the dtype of exponent.
    Otherwise, it follows dtype of base.

    Parameters
    ----------
    exponent
        Numeric literal or expression exponent value.

    Examples
    --------
    >>> df = pl.DataFrame({"x": [1, 2, 4, 8]})
    >>> df.with_columns(
    ...     pl.col("x").pow(3).alias("cube"),
    ...     pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"),
    ... )
    shape: (4, 3)
    ┌─────┬──────┬────────────┐
    │ x   ┆ cube ┆ x ** xlog2 │
    │ --- ┆ ---  ┆ ---        │
    │ i64 ┆ i64  ┆ f64        │
    ╞═════╪══════╪════════════╡
    │ 1   ┆ 1    ┆ 1.0        │
    │ 2   ┆ 8    ┆ 2.0        │
    │ 4   ┆ 64   ┆ 16.0       │
    │ 8   ┆ 512  ┆ 512.0      │
    └─────┴──────┴────────────┘

    Raising an integer to a positive integer results in an integer - in order
    to raise to a negative integer, you can cast either the base or the exponent
    to float first:

    >>> df.with_columns(
    ...     x_squared=pl.col("x").pow(2),
    ...     x_inverse=pl.col("x").pow(-1.0),
    ... )
    shape: (4, 3)
    ┌─────┬───────────┬───────────┐
    │ x   ┆ x_squared ┆ x_inverse │
    │ --- ┆ ---       ┆ ---       │
    │ i64 ┆ i64       ┆ f64       │
    ╞═════╪═══════════╪═══════════╡
    │ 1   ┆ 1         ┆ 1.0       │
    │ 2   ┆ 4         ┆ 0.5       │
    │ 4   ┆ 16        ┆ 0.25      │
    │ 8   ┆ 64        ┆ 0.125     │
    └─────┴───────────┴───────────┘
    """
    # Delegates to the operator dunder so both spellings share one code path.
    raised = self.__pow__(exponent)
    return raised
def is_in(self, other: Expr | Collection[Any] | Series) -> Expr:
    """
    Test, per element, whether this expression's value occurs in `other`.

    Parameters
    ----------
    other
        Series or sequence of primitive type.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]}
    ... )
    >>> df.with_columns(contains=pl.col("optional_members").is_in("sets"))
    shape: (3, 3)
    ┌───────────┬──────────────────┬──────────┐
    │ sets      ┆ optional_members ┆ contains │
    │ ---       ┆ ---              ┆ ---      │
    │ list[i64] ┆ i64              ┆ bool     │
    ╞═══════════╪══════════════════╪══════════╡
    │ [1, 2, 3] ┆ 1                ┆ true     │
    │ [1, 2]    ┆ 2                ┆ true     │
    │ [9, 10]   ┆ 3                ┆ false    │
    └───────────┴──────────────────┴──────────┘
    """
    if isinstance(other, str) or not isinstance(other, Collection):
        # Strings and non-collection inputs go through the generic
        # expression parser.
        candidates = parse_into_expression(other)
    else:
        # Sets are first materialised as a list; the collection is then
        # wrapped in a literal Series.
        values = list(other) if isinstance(other, (set, frozenset)) else other
        candidates = F.lit(pl.Series(values))._pyexpr

    membership = self._pyexpr.is_in(candidates)
    return self._from_pyexpr(membership)
def is_between(
    self,
    lower_bound: IntoExpr,
    upper_bound: IntoExpr,
    closed: ClosedInterval = "both",
) -> Expr:
    """
    Test whether this expression lies within the given lower and upper bounds.

    Parameters
    ----------
    lower_bound
        Lower bound value. Accepts expression input. Strings are parsed as column
        names, other non-expression inputs are parsed as literals.
    upper_bound
        Upper bound value. Accepts expression input. Strings are parsed as column
        names, other non-expression inputs are parsed as literals.
    closed : {'both', 'left', 'right', 'none'}
        Define which sides of the interval are closed (inclusive).

    Notes
    -----
    If the value of the `lower_bound` is greater than that of the `upper_bound`
    then the result will be False, as no value can satisfy the condition.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]})
    >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between"))
    shape: (5, 2)
    ┌─────┬────────────┐
    │ num ┆ is_between │
    │ --- ┆ ---        │
    │ i64 ┆ bool       │
    ╞═════╪════════════╡
    │ 1   ┆ false      │
    │ 2   ┆ true       │
    │ 3   ┆ true       │
    │ 4   ┆ true       │
    │ 5   ┆ false      │
    └─────┴────────────┘

    Use the `closed` argument to include or exclude the values at the bounds:

    >>> df.with_columns(
    ...     pl.col("num").is_between(2, 4, closed="left").alias("is_between")
    ... )
    shape: (5, 2)
    ┌─────┬────────────┐
    │ num ┆ is_between │
    │ --- ┆ ---        │
    │ i64 ┆ bool       │
    ╞═════╪════════════╡
    │ 1   ┆ false      │
    │ 2   ┆ true       │
    │ 3   ┆ true       │
    │ 4   ┆ false      │
    │ 5   ┆ false      │
    └─────┴────────────┘

    You can also use strings as well as numeric/temporal values (note: ensure that
    string literals are wrapped with `lit` so as not to conflate them with
    column names):

    >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]})
    >>> df.with_columns(
    ...     pl.col("a")
    ...     .is_between(pl.lit("a"), pl.lit("c"), closed="both")
    ...     .alias("is_between")
    ... )
    shape: (5, 2)
    ┌─────┬────────────┐
    │ a   ┆ is_between │
    │ --- ┆ ---        │
    │ str ┆ bool       │
    ╞═════╪════════════╡
    │ a   ┆ true       │
    │ b   ┆ true       │
    │ c   ┆ true       │
    │ d   ┆ false      │
    │ e   ┆ false      │
    └─────┴────────────┘

    Use column expressions as lower/upper bounds, comparing to a literal value:

    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1]})
    >>> df.with_columns(
    ...     pl.lit(3).is_between(pl.col("a"), pl.col("b")).alias("between_ab")
    ... )
    shape: (5, 3)
    ┌─────┬─────┬────────────┐
    │ a   ┆ b   ┆ between_ab │
    │ --- ┆ --- ┆ ---        │
    │ i64 ┆ i64 ┆ bool       │
    ╞═════╪═════╪════════════╡
    │ 1   ┆ 5   ┆ true       │
    │ 2   ┆ 4   ┆ true       │
    │ 3   ┆ 3   ┆ true       │
    │ 4   ┆ 2   ┆ false      │
    │ 5   ┆ 1   ┆ false      │
    └─────┴─────┴────────────┘
    """
    # Parse the lower bound first, then the upper bound.
    lower = parse_into_expression(lower_bound)
    upper = parse_into_expression(upper_bound)
    within = self._pyexpr.is_between(lower, upper, closed)
    return self._from_pyexpr(within)
def inspect(self, fmt: str = "{}") -> Expr:
    """
    Print what this expression evaluates to, then pass the value through.

    Parameters
    ----------
    fmt
        Format string used for printing; the evaluated Series is substituted
        via `fmt.format(series)`.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": [1, 1, 2]})
    >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar"))
    value is: shape: (3,)
    Series: 'foo' [i64]
    [
        1
        2
        4
    ]
    shape: (3, 1)
    ┌─────┐
    │ bar │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    │ 4   │
    └─────┘
    """

    def _print_and_forward(s: Series) -> Series:  # pragma: no cover
        # Side effect only: the Series itself is returned unchanged.
        rendered = fmt.format(s)
        print(rendered)
        return s

    return self.map_batches(_print_and_forward, return_dtype=None, agg_list=True)
).with_columns(pl.col("values").interpolate()) shape: (10, 2) ┌─────────────┬────────┐ │ grid_points ┆ values │ │ --- ┆ --- │ │ i64 ┆ f64 │ ╞═════════════╪════════╡ │ 1 ┆ 2.0 │ │ 2 ┆ 4.0 │ │ 3 ┆ 6.0 │ │ 4 ┆ 8.0 │ │ 5 ┆ 10.0 │ │ 6 ┆ 12.0 │ │ 7 ┆ 14.0 │ │ 8 ┆ 16.0 │ │ 9 ┆ 18.0 │ │ 10 ┆ 20.0 │ └─────────────┴────────┘ """ return self._from_pyexpr(self._pyexpr.interpolate(method)) def interpolate_by(self, by: IntoExpr) -> Expr: """ Fill null values using interpolation based on another column. Parameters ---------- by Column to interpolate values based on. Examples -------- Fill null values using linear interpolation. >>> df = pl.DataFrame( ... { ... "a": [1, None, None, 3], ... "b": [1, 2, 7, 8], ... } ... ) >>> df.with_columns(a_interpolated=pl.col("a").interpolate_by("b")) shape: (4, 3) ┌──────┬─────┬────────────────┐ │ a ┆ b ┆ a_interpolated │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ f64 │ ╞══════╪═════╪════════════════╡ │ 1 ┆ 1 ┆ 1.0 │ │ null ┆ 2 ┆ 1.285714 │ │ null ┆ 7 ┆ 2.714286 │ │ 3 ┆ 8 ┆ 3.0 │ └──────┴─────┴────────────────┘ """ by = parse_into_expression(by) return self._from_pyexpr(self._pyexpr.interpolate_by(by)) @unstable() def rolling_min_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ) -> Expr: """ Apply a rolling min based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. window_size The length of the window. 
Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... ).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling min with the temporal windows closed on the right (default) >>> df_temporal.with_columns( ... rolling_row_min=pl.col("index").rolling_min_by("date", window_size="2h") ... 
) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_min │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ u32 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 3 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 19 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ └───────┴─────────────────────┴─────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_min_by(by, window_size, min_periods, closed) ) @unstable() def rolling_max_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ) -> Expr: """ Apply a rolling max based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". 
min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... ).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling max with the temporal windows closed on the right (default) >>> df_temporal.with_columns( ... rolling_row_max=pl.col("index").rolling_max_by("date", window_size="2h") ... 
) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_max │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ u32 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ └───────┴─────────────────────┴─────────────────┘ Compute the rolling max with the closure of windows on both sides >>> df_temporal.with_columns( ... rolling_row_max=pl.col("index").rolling_max_by( ... "date", window_size="2h", closed="both" ... ) ... ) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_max │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ u32 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ └───────┴─────────────────────┴─────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_max_by(by, window_size, min_periods, closed) ) @unstable() def rolling_mean_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ) -> Expr: """ Apply a rolling mean based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. 
Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... 
).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling mean with the temporal windows closed on the right (default) >>> df_temporal.with_columns( ... rolling_row_mean=pl.col("index").rolling_mean_by( ... "date", window_size="2h" ... ) ... ) shape: (25, 3) ┌───────┬─────────────────────┬──────────────────┐ │ index ┆ date ┆ rolling_row_mean │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪══════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.5 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.5 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.5 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.5 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.5 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.5 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.5 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.5 │ └───────┴─────────────────────┴──────────────────┘ Compute the rolling mean with the closure of windows on both sides >>> df_temporal.with_columns( ... rolling_row_mean=pl.col("index").rolling_mean_by( ... "date", window_size="2h", closed="both" ... ) ... 
) shape: (25, 3) ┌───────┬─────────────────────┬──────────────────┐ │ index ┆ date ┆ rolling_row_mean │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪══════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.0 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.0 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ └───────┴─────────────────────┴──────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_mean_by( by, window_size, min_periods, closed, ) ) @unstable() def rolling_sum_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ) -> Expr: """ Apply a rolling sum based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". 
min_periods The number of values in the window that should be non-null before computing a result. by This column must of dtype `{Date, Datetime}` closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... ).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling sum with the temporal windows closed on the right (default) >>> df_temporal.with_columns( ... rolling_row_sum=pl.col("index").rolling_sum_by("date", window_size="2h") ... 
) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_sum │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ u32 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 5 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 7 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 39 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 41 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 43 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 45 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 47 │ └───────┴─────────────────────┴─────────────────┘ Compute the rolling sum with the closure of windows on both sides >>> df_temporal.with_columns( ... rolling_row_sum=pl.col("index").rolling_sum_by( ... "date", window_size="2h", closed="both" ... ) ... ) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_sum │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ u32 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 9 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 57 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ └───────┴─────────────────────┴─────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_sum_by(by, window_size, min_periods, closed) ) @unstable() def rolling_std_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ddof: int = 1, ) -> Expr: """ Compute a rolling standard deviation based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. 
Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. ddof "Delta Degrees of Freedom": The divisor for a length N window is N - ddof Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... 
).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling std with the temporal windows closed on the right (default) >>> df_temporal.with_columns( ... rolling_row_std=pl.col("index").rolling_std_by("date", window_size="2h") ... ) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_std │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.707107 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.707107 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ └───────┴─────────────────────┴─────────────────┘ Compute the rolling std with the closure of windows on both sides >>> df_temporal.with_columns( ... rolling_row_std=pl.col("index").rolling_std_by( ... "date", window_size="2h", closed="both" ... ) ... 
) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_std │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ └───────┴─────────────────────┴─────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_std_by( by, window_size, min_periods, closed, ddof, ) ) @unstable() def rolling_var_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ddof: int = 1, ) -> Expr: """ Compute a rolling variance based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). 
Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. ddof "Delta Degrees of Freedom": The divisor for a length N window is N - ddof Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... ).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling var with the temporal windows closed on the right (default) >>> df_temporal.with_columns( ... rolling_row_var=pl.col("index").rolling_var_by("date", window_size="2h") ... 
) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_var │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.5 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.5 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ └───────┴─────────────────────┴─────────────────┘ Compute the rolling var with the closure of windows on both sides >>> df_temporal.with_columns( ... rolling_row_var=pl.col("index").rolling_var_by( ... "date", window_size="2h", closed="both" ... ) ... ) shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_var │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪═════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ └───────┴─────────────────────┴─────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_var_by( by, window_size, min_periods, closed, ddof, ) ) @unstable() def rolling_median_by( self, by: IntoExpr, window_size: timedelta | str, *, min_periods: int = 1, closed: ClosedInterval = "right", ) -> Expr: """ Compute a rolling median based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. 
Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... 
).with_row_index() >>> df_temporal shape: (25, 2) ┌───────┬─────────────────────┐ │ index ┆ date │ │ --- ┆ --- │ │ u32 ┆ datetime[μs] │ ╞═══════╪═════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 │ │ 1 ┆ 2001-01-01 01:00:00 │ │ 2 ┆ 2001-01-01 02:00:00 │ │ 3 ┆ 2001-01-01 03:00:00 │ │ 4 ┆ 2001-01-01 04:00:00 │ │ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 │ │ 21 ┆ 2001-01-01 21:00:00 │ │ 22 ┆ 2001-01-01 22:00:00 │ │ 23 ┆ 2001-01-01 23:00:00 │ │ 24 ┆ 2001-01-02 00:00:00 │ └───────┴─────────────────────┘ Compute the rolling median with the temporal windows closed on the right: >>> df_temporal.with_columns( ... rolling_row_median=pl.col("index").rolling_median_by( ... "date", window_size="2h" ... ) ... ) shape: (25, 3) ┌───────┬─────────────────────┬────────────────────┐ │ index ┆ date ┆ rolling_row_median │ │ --- ┆ --- ┆ --- │ │ u32 ┆ datetime[μs] ┆ f64 │ ╞═══════╪═════════════════════╪════════════════════╡ │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.5 │ │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.5 │ │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.5 │ │ … ┆ … ┆ … │ │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.5 │ │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.5 │ │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.5 │ │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.5 │ │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.5 │ └───────┴─────────────────────┴────────────────────┘ """ window_size = _prepare_rolling_by_window_args(window_size) by = parse_into_expression(by) return self._from_pyexpr( self._pyexpr.rolling_median_by(by, window_size, min_periods, closed) ) @unstable() def rolling_quantile_by( self, by: IntoExpr, window_size: timedelta | str, *, quantile: float, interpolation: RollingInterpolationMethod = "nearest", min_periods: int = 1, closed: ClosedInterval = "right", ) -> Expr: """ Compute a rolling quantile based on another column. .. warning:: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. 
Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` (the default) means the windows will be: - (t_0 - window_size, t_0] - (t_1 - window_size, t_1] - ... - (t_n - window_size, t_n] Parameters ---------- by This column must be of dtype Datetime or Date. quantile Quantile between 0.0 and 1.0. interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'} Interpolation method. window_size The length of the window. Can be a dynamic temporal size indicated by a timedelta or the following string language: - 1ns (1 nanosecond) - 1us (1 microsecond) - 1ms (1 millisecond) - 1s (1 second) - 1m (1 minute) - 1h (1 hour) - 1d (1 calendar day) - 1w (1 calendar week) - 1mo (1 calendar month) - 1q (1 calendar quarter) - 1y (1 calendar year) - 1i (1 index count) By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year". min_periods The number of values in the window that should be non-null before computing a result. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive), defaults to `'right'`. Notes ----- If you want to compute multiple aggregation statistics over the same dynamic window, consider using `rolling` - this method can cache the window size computation. Examples -------- Create a DataFrame with a datetime column and a row number column >>> from datetime import timedelta, datetime >>> start = datetime(2001, 1, 1) >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} ... 
@unstable()
def rolling_min(
    self,
    window_size: int,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Compute the rolling (moving) minimum of the values in this array.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its min.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_min=pl.col("A").rolling_min(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_min │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 1.0         │
    │ 3.0 ┆ 2.0         │
    │ 4.0 ┆ 3.0         │
    │ 5.0 ┆ 4.0         │
    │ 6.0 ┆ 5.0         │
    └─────┴─────────────┘

    Specify weights to multiply the values in the window with:

    >>> df.with_columns(
    ...     rolling_min=pl.col("A").rolling_min(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_min │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 0.25        │
    │ 3.0 ┆ 0.5         │
    │ 4.0 ┆ 0.75        │
    │ 5.0 ┆ 1.0         │
    │ 6.0 ┆ 1.25        │
    └─────┴─────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_min=pl.col("A").rolling_min(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_min │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 1.0         │
    │ 3.0 ┆ 2.0         │
    │ 4.0 ┆ 3.0         │
    │ 5.0 ┆ 4.0         │
    │ 6.0 ┆ null        │
    └─────┴─────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_min(
        window_size,
        weights,
        min_periods,
        center=center,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_max(
    self,
    window_size: int,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Compute the rolling (moving) maximum of the values in this array.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its max.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_max=pl.col("A").rolling_max(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_max │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 2.0         │
    │ 3.0 ┆ 3.0         │
    │ 4.0 ┆ 4.0         │
    │ 5.0 ┆ 5.0         │
    │ 6.0 ┆ 6.0         │
    └─────┴─────────────┘

    Specify weights to multiply the values in the window with:

    >>> df.with_columns(
    ...     rolling_max=pl.col("A").rolling_max(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_max │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 1.5         │
    │ 3.0 ┆ 2.25        │
    │ 4.0 ┆ 3.0         │
    │ 5.0 ┆ 3.75        │
    │ 6.0 ┆ 4.5         │
    └─────┴─────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_max=pl.col("A").rolling_max(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_max │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 3.0         │
    │ 3.0 ┆ 4.0         │
    │ 4.0 ┆ 5.0         │
    │ 5.0 ┆ 6.0         │
    │ 6.0 ┆ null        │
    └─────┴─────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_max(
        window_size,
        weights,
        min_periods,
        center,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_mean(
    self,
    window_size: int,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Compute the rolling (moving) mean of the values in this array.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its mean.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_mean=pl.col("A").rolling_mean(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────┐
    │ A   ┆ rolling_mean │
    │ --- ┆ ---          │
    │ f64 ┆ f64          │
    ╞═════╪══════════════╡
    │ 1.0 ┆ null         │
    │ 2.0 ┆ 1.5          │
    │ 3.0 ┆ 2.5          │
    │ 4.0 ┆ 3.5          │
    │ 5.0 ┆ 4.5          │
    │ 6.0 ┆ 5.5          │
    └─────┴──────────────┘

    Specify weights to multiply the values in the window with:

    >>> df.with_columns(
    ...     rolling_mean=pl.col("A").rolling_mean(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────┐
    │ A   ┆ rolling_mean │
    │ --- ┆ ---          │
    │ f64 ┆ f64          │
    ╞═════╪══════════════╡
    │ 1.0 ┆ null         │
    │ 2.0 ┆ 1.75         │
    │ 3.0 ┆ 2.75         │
    │ 4.0 ┆ 3.75         │
    │ 5.0 ┆ 4.75         │
    │ 6.0 ┆ 5.75         │
    └─────┴──────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────┐
    │ A   ┆ rolling_mean │
    │ --- ┆ ---          │
    │ f64 ┆ f64          │
    ╞═════╪══════════════╡
    │ 1.0 ┆ null         │
    │ 2.0 ┆ 2.0          │
    │ 3.0 ┆ 3.0          │
    │ 4.0 ┆ 4.0          │
    │ 5.0 ┆ 5.0          │
    │ 6.0 ┆ null         │
    └─────┴──────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_mean(
        window_size,
        weights,
        min_periods,
        center,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_sum(
    self,
    window_size: int | timedelta,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Compute the rolling (moving) sum of the values in this array.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its sum.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_sum=pl.col("A").rolling_sum(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_sum │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 3.0         │
    │ 3.0 ┆ 5.0         │
    │ 4.0 ┆ 7.0         │
    │ 5.0 ┆ 9.0         │
    │ 6.0 ┆ 11.0        │
    └─────┴─────────────┘

    Specify weights to multiply the values in the window with:

    >>> df.with_columns(
    ...     rolling_sum=pl.col("A").rolling_sum(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_sum │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 1.75        │
    │ 3.0 ┆ 2.75        │
    │ 4.0 ┆ 3.75        │
    │ 5.0 ┆ 4.75        │
    │ 6.0 ┆ 5.75        │
    └─────┴─────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_sum │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 6.0         │
    │ 3.0 ┆ 9.0         │
    │ 4.0 ┆ 12.0        │
    │ 5.0 ┆ 15.0        │
    │ 6.0 ┆ null        │
    └─────┴─────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_sum(
        window_size,
        weights,
        min_periods,
        center,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_std(
    self,
    window_size: int | timedelta,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
    ddof: int = 1,
) -> Expr:
    """
    Compute the standard deviation over a rolling window.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its std.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.
    ddof
        "Delta Degrees of Freedom": The divisor for a length N window is
        N - ddof

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_std=pl.col("A").rolling_std(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_std │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 0.707107    │
    │ 3.0 ┆ 0.707107    │
    │ 4.0 ┆ 0.707107    │
    │ 5.0 ┆ 0.707107    │
    │ 6.0 ┆ 0.707107    │
    └─────┴─────────────┘

    Specify weights to multiply the values in the window with:

    >>> df.with_columns(
    ...     rolling_std=pl.col("A").rolling_std(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_std │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 0.433013    │
    │ 3.0 ┆ 0.433013    │
    │ 4.0 ┆ 0.433013    │
    │ 5.0 ┆ 0.433013    │
    │ 6.0 ┆ 0.433013    │
    └─────┴─────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_std=pl.col("A").rolling_std(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_std │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 1.0         │
    │ 3.0 ┆ 1.0         │
    │ 4.0 ┆ 1.0         │
    │ 5.0 ┆ 1.0         │
    │ 6.0 ┆ null        │
    └─────┴─────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_std(
        window_size,
        weights,
        min_periods,
        center=center,
        ddof=ddof,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_var(
    self,
    window_size: int | timedelta,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
    ddof: int = 1,
) -> Expr:
    """
    Compute the variance over a rolling window.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its var.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.
    ddof
        "Delta Degrees of Freedom": The divisor for a length N window is
        N - ddof

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_var=pl.col("A").rolling_var(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_var │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 0.5         │
    │ 3.0 ┆ 0.5         │
    │ 4.0 ┆ 0.5         │
    │ 5.0 ┆ 0.5         │
    │ 6.0 ┆ 0.5         │
    └─────┴─────────────┘

    Specify weights to multiply the values in the window with:

    >>> df.with_columns(
    ...     rolling_var=pl.col("A").rolling_var(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_var │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 0.1875      │
    │ 3.0 ┆ 0.1875      │
    │ 4.0 ┆ 0.1875      │
    │ 5.0 ┆ 0.1875      │
    │ 6.0 ┆ 0.1875      │
    └─────┴─────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_var=pl.col("A").rolling_var(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬─────────────┐
    │ A   ┆ rolling_var │
    │ --- ┆ ---         │
    │ f64 ┆ f64         │
    ╞═════╪═════════════╡
    │ 1.0 ┆ null        │
    │ 2.0 ┆ 1.0         │
    │ 3.0 ┆ 1.0         │
    │ 4.0 ┆ 1.0         │
    │ 5.0 ┆ 1.0         │
    │ 6.0 ┆ null        │
    └─────┴─────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_var(
        window_size,
        weights,
        min_periods,
        center=center,
        ddof=ddof,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_median(
    self,
    window_size: int | timedelta,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Compute the median over a rolling window.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its median.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_median=pl.col("A").rolling_median(window_size=2),
    ... )
    shape: (6, 2)
    ┌─────┬────────────────┐
    │ A   ┆ rolling_median │
    │ --- ┆ ---            │
    │ f64 ┆ f64            │
    ╞═════╪════════════════╡
    │ 1.0 ┆ null           │
    │ 2.0 ┆ 1.5            │
    │ 3.0 ┆ 2.5            │
    │ 4.0 ┆ 3.5            │
    │ 5.0 ┆ 4.5            │
    │ 6.0 ┆ 5.5            │
    └─────┴────────────────┘

    Specify weights for the values in each window:

    >>> df.with_columns(
    ...     rolling_median=pl.col("A").rolling_median(
    ...         window_size=2, weights=[0.25, 0.75]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬────────────────┐
    │ A   ┆ rolling_median │
    │ --- ┆ ---            │
    │ f64 ┆ f64            │
    ╞═════╪════════════════╡
    │ 1.0 ┆ null           │
    │ 2.0 ┆ 1.5            │
    │ 3.0 ┆ 2.5            │
    │ 4.0 ┆ 3.5            │
    │ 5.0 ┆ 4.5            │
    │ 6.0 ┆ 5.5            │
    └─────┴────────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_median=pl.col("A").rolling_median(window_size=3, center=True),
    ... )
    shape: (6, 2)
    ┌─────┬────────────────┐
    │ A   ┆ rolling_median │
    │ --- ┆ ---            │
    │ f64 ┆ f64            │
    ╞═════╪════════════════╡
    │ 1.0 ┆ null           │
    │ 2.0 ┆ 2.0            │
    │ 3.0 ┆ 3.0            │
    │ 4.0 ┆ 4.0            │
    │ 5.0 ┆ 5.0            │
    │ 6.0 ┆ null           │
    └─────┴────────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_median(
        window_size,
        weights,
        min_periods,
        center=center,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_quantile(
    self,
    quantile: float,
    interpolation: RollingInterpolationMethod = "nearest",
    window_size: int | timedelta = 2,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Compute a quantile over a rolling window.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    A window of length `window_size` slides over the array. The values inside
    the window are (optionally) multiplied with the entries of the `weights`
    vector, after which the window is aggregated to its quantile.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
        Interpolation method.
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Notes
    -----
    If you want to compute multiple aggregation statistics over the same
    dynamic window, consider using `rolling` - this method can cache the
    window size computation.

    Examples
    --------
    >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    >>> df.with_columns(
    ...     rolling_quantile=pl.col("A").rolling_quantile(
    ...         quantile=0.25, window_size=4
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────────┐
    │ A   ┆ rolling_quantile │
    │ --- ┆ ---              │
    │ f64 ┆ f64              │
    ╞═════╪══════════════════╡
    │ 1.0 ┆ null             │
    │ 2.0 ┆ null             │
    │ 3.0 ┆ null             │
    │ 4.0 ┆ 2.0              │
    │ 5.0 ┆ 3.0              │
    │ 6.0 ┆ 4.0              │
    └─────┴──────────────────┘

    Specify weights for the values in each window:

    >>> df.with_columns(
    ...     rolling_quantile=pl.col("A").rolling_quantile(
    ...         quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2]
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────────┐
    │ A   ┆ rolling_quantile │
    │ --- ┆ ---              │
    │ f64 ┆ f64              │
    ╞═════╪══════════════════╡
    │ 1.0 ┆ null             │
    │ 2.0 ┆ null             │
    │ 3.0 ┆ null             │
    │ 4.0 ┆ 2.0              │
    │ 5.0 ┆ 3.0              │
    │ 6.0 ┆ 4.0              │
    └─────┴──────────────────┘

    Specify weights and interpolation method

    >>> df.with_columns(
    ...     rolling_quantile=pl.col("A").rolling_quantile(
    ...         quantile=0.25,
    ...         window_size=4,
    ...         weights=[0.2, 0.4, 0.4, 0.2],
    ...         interpolation="linear",
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────────┐
    │ A   ┆ rolling_quantile │
    │ --- ┆ ---              │
    │ f64 ┆ f64              │
    ╞═════╪══════════════════╡
    │ 1.0 ┆ null             │
    │ 2.0 ┆ null             │
    │ 3.0 ┆ null             │
    │ 4.0 ┆ 1.625            │
    │ 5.0 ┆ 2.625            │
    │ 6.0 ┆ 3.625            │
    └─────┴──────────────────┘

    Center the values in the window

    >>> df.with_columns(
    ...     rolling_quantile=pl.col("A").rolling_quantile(
    ...         quantile=0.2, window_size=5, center=True
    ...     ),
    ... )
    shape: (6, 2)
    ┌─────┬──────────────────┐
    │ A   ┆ rolling_quantile │
    │ --- ┆ ---              │
    │ f64 ┆ f64              │
    ╞═════╪══════════════════╡
    │ 1.0 ┆ null             │
    │ 2.0 ┆ null             │
    │ 3.0 ┆ 2.0              │
    │ 4.0 ┆ 3.0              │
    │ 5.0 ┆ null             │
    │ 6.0 ┆ null             │
    └─────┴──────────────────┘
    """
    # Delegate to the native expression, then wrap the result back up.
    windowed = self._pyexpr.rolling_quantile(
        quantile,
        interpolation,
        window_size,
        weights,
        min_periods,
        center=center,
    )
    return self._from_pyexpr(windowed)
@unstable()
def rolling_skew(self, window_size: int, *, bias: bool = True) -> Expr:
    """
    Compute the skew over a rolling window.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    For a given row the window holds the row itself plus the
    `window_size - 1` elements before it.

    Parameters
    ----------
    window_size
        Integer size of the rolling window.
    bias
        If False, the calculations are corrected for statistical bias.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 4, 2, 9]})
    >>> df.select(pl.col("a").rolling_skew(3))
    shape: (4, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ null     │
    │ null     │
    │ 0.381802 │
    │ 0.47033  │
    └──────────┘

    Note how the values match the following:

    >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew()
    (0.38180177416060584, 0.47033046033698594)
    """
    skewed = self._pyexpr.rolling_skew(window_size, bias)
    return self._from_pyexpr(skewed)


@unstable()
def rolling_map(
    self,
    function: Callable[[Series], Any],
    window_size: int,
    weights: list[float] | None = None,
    *,
    min_periods: int | None = None,
    center: bool = False,
) -> Expr:
    """
    Apply a custom function over a rolling window.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Parameters
    ----------
    function
        Custom aggregation function.
    window_size
        The length of the window in number of elements.
    weights
        An optional slice with the same length as the window that will be
        multiplied elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before
        computing a result. If set to `None` (default), it will be set equal
        to `window_size`.
    center
        Set the labels at the center of the window.

    Warnings
    --------
    Computing custom functions is extremely slow. Use specialized rolling
    functions such as :func:`Expr.rolling_sum` if at all possible.

    Examples
    --------
    >>> from numpy import nansum
    >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]})
    >>> df.select(pl.col("a").rolling_map(nansum, window_size=3))
    shape: (5, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ null │
    │ null │
    │ 22.0 │
    │ 11.0 │
    │ 17.0 │
    └──────┘
    """
    # An unset `min_periods` falls back to the full window length.
    required = window_size if min_periods is None else min_periods
    mapped = self._pyexpr.rolling_map(function, window_size, weights, required, center)
    return self._from_pyexpr(mapped)
def abs(self) -> Expr:
    """
    Compute the absolute value of every element.

    Same as `abs(expr)`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "A": [-1.0, 0.0, 1.0, 2.0],
    ...     }
    ... )
    >>> df.select(pl.col("A").abs())
    shape: (4, 1)
    ┌─────┐
    │ A   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 1.0 │
    │ 0.0 │
    │ 1.0 │
    │ 2.0 │
    └─────┘
    """
    absolute = self._pyexpr.abs()
    return self._from_pyexpr(absolute)


def rank(
    self,
    method: RankMethod = "average",
    *,
    descending: bool = False,
    seed: int | None = None,
) -> Expr:
    """
    Rank the data, resolving ties according to `method`.

    Parameters
    ----------
    method : {'average', 'min', 'max', 'dense', 'ordinal', 'random'}
        The method used to assign ranks to tied elements.
        The following methods are available (default is 'average'):

        - 'average' : The average of the ranks that would have been assigned to
          all the tied values is assigned to each value.
        - 'min' : The minimum of the ranks that would have been assigned to all
          the tied values is assigned to each value. (This is also referred to
          as "competition" ranking.)
        - 'max' : The maximum of the ranks that would have been assigned to all
          the tied values is assigned to each value.
        - 'dense' : Like 'min', but the rank of the next highest element is
          assigned the rank immediately after those assigned to the tied
          elements.
        - 'ordinal' : All values are given a distinct rank, corresponding to
          the order that the values occur in the Series.
        - 'random' : Like 'ordinal', but the rank for ties is not dependent
          on the order that the values occur in the Series.
    descending
        Rank in descending order.
    seed
        If `method="random"`, use this as seed.

    Examples
    --------
    The 'average' method:

    >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
    >>> df.select(pl.col("a").rank())
    shape: (5, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 3.0 │
    │ 4.5 │
    │ 1.5 │
    │ 1.5 │
    │ 4.5 │
    └─────┘

    The 'ordinal' method:

    >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
    >>> df.select(pl.col("a").rank("ordinal"))
    shape: (5, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 3   │
    │ 4   │
    │ 1   │
    │ 2   │
    │ 5   │
    └─────┘

    Use 'rank' with 'over' to rank within groups:

    >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]})
    >>> df.with_columns(pl.col("b").rank().over("a").alias("rank"))
    shape: (5, 3)
    ┌─────┬─────┬──────┐
    │ a   ┆ b   ┆ rank │
    │ --- ┆ --- ┆ ---  │
    │ i64 ┆ i64 ┆ f64  │
    ╞═════╪═════╪══════╡
    │ 1   ┆ 6   ┆ 1.0  │
    │ 1   ┆ 7   ┆ 2.0  │
    │ 2   ┆ 5   ┆ 1.0  │
    │ 2   ┆ 14  ┆ 3.0  │
    │ 2   ┆ 11  ┆ 2.0  │
    └─────┴─────┴──────┘
    """
    ranked = self._pyexpr.rank(method, descending, seed)
    return self._from_pyexpr(ranked)
def diff(self, n: int = 1, null_behavior: NullBehavior = "ignore") -> Expr:
    """
    Compute the first discrete difference between shifted items.

    Parameters
    ----------
    n
        Number of slots to shift.
    null_behavior : {'ignore', 'drop'}
        How to handle null values.

    Examples
    --------
    >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]})
    >>> df.with_columns(change=pl.col("int").diff())
    shape: (5, 2)
    ┌─────┬────────┐
    │ int ┆ change │
    │ --- ┆ ---    │
    │ i64 ┆ i64    │
    ╞═════╪════════╡
    │ 20  ┆ null   │
    │ 10  ┆ -10    │
    │ 30  ┆ 20     │
    │ 25  ┆ -5     │
    │ 35  ┆ 10     │
    └─────┴────────┘

    >>> df.with_columns(change=pl.col("int").diff(n=2))
    shape: (5, 2)
    ┌─────┬────────┐
    │ int ┆ change │
    │ --- ┆ ---    │
    │ i64 ┆ i64    │
    ╞═════╪════════╡
    │ 20  ┆ null   │
    │ 10  ┆ null   │
    │ 30  ┆ 10     │
    │ 25  ┆ 15     │
    │ 35  ┆ 5      │
    └─────┴────────┘

    >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff"))
    shape: (3, 1)
    ┌──────┐
    │ diff │
    │ ---  │
    │ i64  │
    ╞══════╡
    │ 10   │
    │ 15   │
    │ 5    │
    └──────┘
    """
    differenced = self._pyexpr.diff(n, null_behavior)
    return self._from_pyexpr(differenced)
def pct_change(self, n: int | IntoExprColumn = 1) -> Expr:
    """
    Compute the percentage change between values.

    Percentage change (as fraction) between current element and most-recent
    non-null element at least `n` period(s) before the current element.

    Computes the change from the previous row by default.

    Parameters
    ----------
    n
        periods to shift for forming percent change.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [10, 11, 12, None, 12],
    ...     }
    ... )
    >>> df.with_columns(pl.col("a").pct_change().alias("pct_change"))
    shape: (5, 2)
    ┌──────┬────────────┐
    │ a    ┆ pct_change │
    │ ---  ┆ ---        │
    │ i64  ┆ f64        │
    ╞══════╪════════════╡
    │ 10   ┆ null       │
    │ 11   ┆ 0.1        │
    │ 12   ┆ 0.090909   │
    │ null ┆ 0.0        │
    │ 12   ┆ 0.0        │
    └──────┴────────────┘
    """
    # `n` is first parsed into an expression before reaching the native call.
    periods = parse_into_expression(n)
    changed = self._pyexpr.pct_change(periods)
    return self._from_pyexpr(changed)


def skew(self, *, bias: bool = True) -> Expr:
    r"""
    Compute the sample skewness of a data set.

    For normally distributed data, the skewness should be about zero. For
    unimodal continuous distributions, a skewness value greater than zero means
    that there is more weight in the right tail of the distribution. The
    function `skewtest` can be used to determine if the skewness value
    is close enough to zero, statistically speaking.

    See scipy.stats for more information.

    Parameters
    ----------
    bias : bool, optional
        If False, the calculations are corrected for statistical bias.

    Notes
    -----
    The sample skewness is computed as the Fisher-Pearson coefficient
    of skewness, i.e.

    .. math:: g_1=\frac{m_3}{m_2^{3/2}}

    where

    .. math:: m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i

    is the biased sample :math:`i\texttt{th}` central moment, and
    :math:`\bar{x}` is
    the sample mean. If `bias` is False, the calculations are
    corrected for bias and the value computed is the adjusted
    Fisher-Pearson standardized moment coefficient, i.e.

    .. math::
        G_1 = \frac{k_3}{k_2^{3/2}} = \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]})
    >>> df.select(pl.col("a").skew())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.343622 │
    └──────────┘
    """
    skewness = self._pyexpr.skew(bias)
    return self._from_pyexpr(skewness)
def kurtosis(self, *, fisher: bool = True, bias: bool = True) -> Expr:
    """
    Compute the kurtosis (Fisher or Pearson) of a dataset.

    Kurtosis is the fourth central moment divided by the square of the
    variance. If Fisher's definition is used, then 3.0 is subtracted from
    the result to give 0.0 for a normal distribution.
    If bias is False then the kurtosis is calculated using k statistics to
    eliminate bias coming from biased moment estimators.

    See scipy.stats for more information

    Parameters
    ----------
    fisher : bool, optional
        If True, Fisher's definition is used (normal ==> 0.0). If False,
        Pearson's definition is used (normal ==> 3.0).
    bias : bool, optional
        If False, the calculations are corrected for statistical bias.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]})
    >>> df.select(pl.col("a").kurtosis())
    shape: (1, 1)
    ┌───────────┐
    │ a         │
    │ ---       │
    │ f64       │
    ╞═══════════╡
    │ -1.153061 │
    └───────────┘
    """
    kurt = self._pyexpr.kurtosis(fisher, bias)
    return self._from_pyexpr(kurt)
def clip(
    self,
    lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = None,
    upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = None,
) -> Expr:
    """
    Replace values outside the given boundaries with the boundary value.

    Parameters
    ----------
    lower_bound
        Lower bound. Accepts expression input. Non-expression inputs are
        parsed as literals.
    upper_bound
        Upper bound. Accepts expression input. Non-expression inputs are
        parsed as literals.

    See Also
    --------
    when

    Notes
    -----
    This method only works for numeric and temporal columns. To clip other data
    types, consider writing a `when-then-otherwise` expression. See :func:`when`.

    Examples
    --------
    Specifying both a lower and upper bound:

    >>> df = pl.DataFrame({"a": [-50, 5, 50, None]})
    >>> df.with_columns(clip=pl.col("a").clip(1, 10))
    shape: (4, 2)
    ┌──────┬──────┐
    │ a    ┆ clip │
    │ ---  ┆ ---  │
    │ i64  ┆ i64  │
    ╞══════╪══════╡
    │ -50  ┆ 1    │
    │ 5    ┆ 5    │
    │ 50   ┆ 10   │
    │ null ┆ null │
    └──────┴──────┘

    Specifying only a single bound:

    >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10))
    shape: (4, 2)
    ┌──────┬──────┐
    │ a    ┆ clip │
    │ ---  ┆ ---  │
    │ i64  ┆ i64  │
    ╞══════╪══════╡
    │ -50  ┆ -50  │
    │ 5    ┆ 5    │
    │ 50   ┆ 10   │
    │ null ┆ null │
    └──────┴──────┘
    """
    # Only bounds that were actually supplied are parsed; `None` is passed
    # through unchanged.
    lower = None if lower_bound is None else parse_into_expression(lower_bound)
    upper = None if upper_bound is None else parse_into_expression(upper_bound)
    clipped = self._pyexpr.clip(lower, upper)
    return self._from_pyexpr(clipped)


def lower_bound(self) -> Expr:
    """
    Calculate the lower bound.

    Returns a unit Series with the lowest value possible for the dtype of this
    expression.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]})
    >>> df.select(pl.col("a").lower_bound())
    shape: (1, 1)
    ┌──────────────────────┐
    │ a                    │
    │ ---                  │
    │ i64                  │
    ╞══════════════════════╡
    │ -9223372036854775808 │
    └──────────────────────┘
    """
    lowest = self._pyexpr.lower_bound()
    return self._from_pyexpr(lowest)
def upper_bound(self) -> Expr:
    """
    Calculate the upper bound.

    Returns a unit Series with the highest value possible for the dtype of this
    expression.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]})
    >>> df.select(pl.col("a").upper_bound())
    shape: (1, 1)
    ┌─────────────────────┐
    │ a                   │
    │ ---                 │
    │ i64                 │
    ╞═════════════════════╡
    │ 9223372036854775807 │
    └─────────────────────┘
    """
    highest = self._pyexpr.upper_bound()
    return self._from_pyexpr(highest)


def sign(self) -> Expr:
    """
    Compute the element-wise sign function on numeric types.

    The returned value is computed as follows:

    * -1 if x < 0.
    * 1 if x > 0.
    * x otherwise (typically 0, but could be NaN if the input is).

    Null values are preserved as-is, and the dtype of the input is preserved.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, float("nan"), None]})
    >>> df.select(pl.col.a.sign())
    shape: (6, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ -1.0 │
    │ -0.0 │
    │ 0.0  │
    │ 1.0  │
    │ NaN  │
    │ null │
    └──────┘
    """
    signed = self._pyexpr.sign()
    return self._from_pyexpr(signed)


def sin(self) -> Expr:
    """
    Compute the element-wise value for the sine.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [0.0]})
    >>> df.select(pl.col("a").sin())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 0.0 │
    └─────┘
    """
    sine = self._pyexpr.sin()
    return self._from_pyexpr(sine)


def cos(self) -> Expr:
    """
    Compute the element-wise value for the cosine.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [0.0]})
    >>> df.select(pl.col("a").cos())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 1.0 │
    └─────┘
    """
    cosine = self._pyexpr.cos()
    return self._from_pyexpr(cosine)
def tan(self) -> Expr:
    """
    Take the tangent of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").tan().round(2))
    shape: (1, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ 1.56 │
    └──────┘
    """
    tangent = self._pyexpr.tan()
    return self._from_pyexpr(tangent)


def cot(self) -> Expr:
    """
    Take the cotangent of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").cot().round(2))
    shape: (1, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ 0.64 │
    └──────┘
    """
    cotangent = self._pyexpr.cot()
    return self._from_pyexpr(cotangent)


def arcsin(self) -> Expr:
    """
    Take the inverse sine of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").arcsin())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 1.570796 │
    └──────────┘
    """
    inverse_sine = self._pyexpr.arcsin()
    return self._from_pyexpr(inverse_sine)


def arccos(self) -> Expr:
    """
    Take the inverse cosine of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [0.0]})
    >>> df.select(pl.col("a").arccos())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 1.570796 │
    └──────────┘
    """
    inverse_cosine = self._pyexpr.arccos()
    return self._from_pyexpr(inverse_cosine)
def sinh(self) -> Expr:
    """
    Take the hyperbolic sine of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").sinh())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 1.175201 │
    └──────────┘
    """
    hyperbolic_sine = self._pyexpr.sinh()
    return self._from_pyexpr(hyperbolic_sine)


def cosh(self) -> Expr:
    """
    Take the hyperbolic cosine of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").cosh())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 1.543081 │
    └──────────┘
    """
    hyperbolic_cosine = self._pyexpr.cosh()
    return self._from_pyexpr(hyperbolic_cosine)


def tanh(self) -> Expr:
    """
    Take the hyperbolic tangent of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").tanh())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.761594 │
    └──────────┘
    """
    hyperbolic_tangent = self._pyexpr.tanh()
    return self._from_pyexpr(hyperbolic_tangent)


def arcsinh(self) -> Expr:
    """
    Take the inverse hyperbolic sine of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").arcsinh())
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.881374 │
    └──────────┘
    """
    inverse_hyperbolic_sine = self._pyexpr.arcsinh()
    return self._from_pyexpr(inverse_hyperbolic_sine)
def arctanh(self) -> Expr:
    """
    Take the inverse hyperbolic tangent of each element.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1.0]})
    >>> df.select(pl.col("a").arctanh())
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ f64 │
    ╞═════╡
    │ inf │
    └─────┘
    """
    inverse_hyperbolic_tangent = self._pyexpr.arctanh()
    return self._from_pyexpr(inverse_hyperbolic_tangent)


def degrees(self) -> Expr:
    """
    Convert each element from radians to degrees.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> import math
    >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]})
    >>> df.select(pl.col("a").degrees())
    shape: (9, 1)
    ┌────────┐
    │ a      │
    │ ---    │
    │ f64    │
    ╞════════╡
    │ -720.0 │
    │ -540.0 │
    │ -360.0 │
    │ -180.0 │
    │ 0.0    │
    │ 180.0  │
    │ 360.0  │
    │ 540.0  │
    │ 720.0  │
    └────────┘
    """
    in_degrees = self._pyexpr.degrees()
    return self._from_pyexpr(in_degrees)


def radians(self) -> Expr:
    """
    Convert each element from degrees to radians.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]})
    >>> df.select(pl.col("a").radians())
    shape: (9, 1)
    ┌────────────┐
    │ a          │
    │ ---        │
    │ f64        │
    ╞════════════╡
    │ -12.566371 │
    │ -9.424778  │
    │ -6.283185  │
    │ -3.141593  │
    │ 0.0        │
    │ 3.141593   │
    │ 6.283185   │
    │ 9.424778   │
    │ 12.566371  │
    └────────────┘
    """
    in_radians = self._pyexpr.radians()
    return self._from_pyexpr(in_radians)
def shuffle(self, seed: int | None = None) -> Expr:
    """
    Randomly reorder the values of this expression.

    Note this is shuffled independently of any other column or Expression. If you
    want each row to stay the same use df.sample(shuffle=True)

    Parameters
    ----------
    seed
        Seed for the random number generator. If set to None (default), a
        random seed is generated each time the shuffle is called.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3]})
    >>> df.select(pl.col("a").shuffle(seed=1))
    shape: (3, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 2   │
    │ 1   │
    │ 3   │
    └─────┘
    """
    shuffled = self._pyexpr.shuffle(seed)
    return self._from_pyexpr(shuffled)


def sample(
    self,
    n: int | IntoExprColumn | None = None,
    *,
    fraction: float | IntoExprColumn | None = None,
    with_replacement: bool = False,
    shuffle: bool = False,
    seed: int | None = None,
) -> Expr:
    """
    Draw a random sample from this expression.

    Parameters
    ----------
    n
        Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
        `fraction` is None.
    fraction
        Fraction of items to return. Cannot be used with `n`.
    with_replacement
        Allow values to be sampled more than once.
    shuffle
        Shuffle the order of sampled data points.
    seed
        Seed for the random number generator. If set to None (default), a
        random seed is generated for each sample operation.

    Raises
    ------
    ValueError
        If both `n` and `fraction` are given.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3]})
    >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1))
    shape: (3, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 3   │
    │ 1   │
    │ 1   │
    └─────┘
    """
    both_given = not (n is None or fraction is None)
    if both_given:
        msg = "cannot specify both `n` and `fraction`"
        raise ValueError(msg)

    if fraction is None:
        # Count-based sampling; a missing `n` means a single item.
        count = parse_into_expression(1 if n is None else n)
        sampled = self._pyexpr.sample_n(count, with_replacement, shuffle, seed)
    else:
        frac = parse_into_expression(fraction)
        sampled = self._pyexpr.sample_frac(frac, with_replacement, shuffle, seed)
    return self._from_pyexpr(sampled)
def ewm_mean_by(
    self,
    by: str | IntoExpr,
    *,
    half_life: str | timedelta,
) -> Expr:
    r"""
    Compute a time-based exponentially weighted moving average.

    Given observations :math:`x_0, x_1, \ldots, x_{n-1}` at times
    :math:`t_0, t_1, \ldots, t_{n-1}`, the EWMA is calculated as

        .. math::

            y_0 &= x_0

            \alpha_i &= 1 - \exp \left\{ \frac{ -\ln(2)(t_i-t_{i-1}) }
                { \tau } \right\}

            y_i &= \alpha_i x_i + (1 - \alpha_i) y_{i-1}; \quad i > 0

    where :math:`\tau` is the `half_life`.

    Parameters
    ----------
    by
        Times to calculate average by. Should be ``DateTime``, ``Date``, ``UInt64``,
        ``UInt32``, ``Int64``, or ``Int32`` data type.
    half_life
        Unit over which observation decays to half its value.

        Can be created either from a timedelta, or
        by using the following string language:

        - 1ns   (1 nanosecond)
        - 1us   (1 microsecond)
        - 1ms   (1 millisecond)
        - 1s    (1 second)
        - 1m    (1 minute)
        - 1h    (1 hour)
        - 1d    (1 day)
        - 1w    (1 week)
        - 1i    (1 index count)

        Or combine them:
        "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

        Note that `half_life` is treated as a constant duration - calendar
        durations such as months (or even days in the time-zone-aware case)
        are not supported, please express your duration in an approximately
        equivalent number of hours (e.g. '370h' instead of '1mo').

    Returns
    -------
    Expr
        Float32 if input is Float32, otherwise Float64.

    Examples
    --------
    >>> from datetime import date, timedelta
    >>> df = pl.DataFrame(
    ...     {
    ...         "values": [0, 1, 2, None, 4],
    ...         "times": [
    ...             date(2020, 1, 1),
    ...             date(2020, 1, 3),
    ...             date(2020, 1, 10),
    ...             date(2020, 1, 15),
    ...             date(2020, 1, 17),
    ...         ],
    ...     }
    ... ).sort("times")
    >>> df.with_columns(
    ...     result=pl.col("values").ewm_mean_by("times", half_life="4d"),
    ... )
    shape: (5, 3)
    ┌────────┬────────────┬──────────┐
    │ values ┆ times      ┆ result   │
    │ ---    ┆ ---        ┆ ---      │
    │ i64    ┆ date       ┆ f64      │
    ╞════════╪════════════╪══════════╡
    │ 0      ┆ 2020-01-01 ┆ 0.0      │
    │ 1      ┆ 2020-01-03 ┆ 0.292893 │
    │ 2      ┆ 2020-01-10 ┆ 1.492474 │
    │ null   ┆ 2020-01-15 ┆ null     │
    │ 4      ┆ 2020-01-17 ┆ 3.254508 │
    └────────┴────────────┴──────────┘
    """
    times = parse_into_expression(by)
    # Both accepted forms (str / timedelta) are converted to a duration string.
    duration = parse_as_duration_string(half_life)
    averaged = self._pyexpr.ewm_mean_by(times, duration)
    return self._from_pyexpr(averaged)
math:: \alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0 span Specify decay in terms of span, :math:`\theta`, with .. math:: \alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1 half_life Specify decay in terms of half-life, :math:`\lambda`, with .. math:: \alpha = 1 - \exp \left\{ \frac{ -\ln(2) }{ \lambda } \right\} \; \forall \; \lambda > 0 alpha Specify smoothing factor alpha directly, :math:`0 < \alpha \leq 1`. adjust Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings - When `adjust=True` (the default) the EW function is calculated using weights :math:`w_i = (1 - \alpha)^i` - When `adjust=False` the EW function is calculated recursively by .. math:: y_0 &= x_0 \\ y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t bias When `bias=False`, apply a correction to make the estimate statistically unbiased. min_periods Minimum number of observations in window required to have a value (otherwise result is null). ignore_nulls Ignore missing values when calculating weights. - When `ignore_nulls=False` (default), weights are based on absolute positions. For example, the weights of :math:`x_0` and :math:`x_2` used in calculating the final weighted average of [:math:`x_0`, None, :math:`x_2`] are :math:`(1-\alpha)^2` and :math:`1` if `adjust=True`, and :math:`(1-\alpha)^2` and :math:`\alpha` if `adjust=False`. - When `ignore_nulls=True`, weights are based on relative positions. For example, the weights of :math:`x_0` and :math:`x_2` used in calculating the final weighted average of [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if `adjust=True`, and :math:`1-\alpha` and :math:`\alpha` if `adjust=False`. 
Examples -------- >>> df = pl.DataFrame({"a": [1, 2, 3]}) >>> df.select(pl.col("a").ewm_std(com=1, ignore_nulls=False)) shape: (3, 1) ┌──────────┐ │ a │ │ --- │ │ f64 │ ╞══════════╡ │ 0.0 │ │ 0.707107 │ │ 0.963624 │ └──────────┘ """ alpha = _prepare_alpha(com, span, half_life, alpha) return self._from_pyexpr( self._pyexpr.ewm_std(alpha, adjust, bias, min_periods, ignore_nulls) ) def ewm_var( self, *, com: float | None = None, span: float | None = None, half_life: float | None = None, alpha: float | None = None, adjust: bool = True, bias: bool = False, min_periods: int = 1, ignore_nulls: bool = False, ) -> Expr: r""" Compute exponentially-weighted moving variance. Parameters ---------- com Specify decay in terms of center of mass, :math:`\gamma`, with .. math:: \alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0 span Specify decay in terms of span, :math:`\theta`, with .. math:: \alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1 half_life Specify decay in terms of half-life, :math:`\lambda`, with .. math:: \alpha = 1 - \exp \left\{ \frac{ -\ln(2) }{ \lambda } \right\} \; \forall \; \lambda > 0 alpha Specify smoothing factor alpha directly, :math:`0 < \alpha \leq 1`. adjust Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings - When `adjust=True` (the default) the EW function is calculated using weights :math:`w_i = (1 - \alpha)^i` - When `adjust=False` the EW function is calculated recursively by .. math:: y_0 &= x_0 \\ y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t bias When `bias=False`, apply a correction to make the estimate statistically unbiased. min_periods Minimum number of observations in window required to have a value (otherwise result is null). ignore_nulls Ignore missing values when calculating weights. - When `ignore_nulls=False` (default), weights are based on absolute positions. 
def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Expr:
    """
    Append `n` copies of a value to the end of the expression result.

    Described upstream as an extremely fast method for extending the Series
    with 'n' copies of a value.

    Parameters
    ----------
    value
        A constant literal value or a unit expression with which to extend the
        expression result Series; can pass None to extend with nulls.
    n
        The number of additional values that will be added.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [1, 2, 3]})
    >>> df.select((pl.col("values") - 1).extend_constant(99, n=2))
    shape: (5, 1)
    ┌────────┐
    │ values │
    │ ---    │
    │ i64    │
    ╞════════╡
    │ 0      │
    │ 1      │
    │ 2      │
    │ 99     │
    │ 99     │
    └────────┘
    """
    fill = parse_into_expression(value, str_as_lit=True)
    repeats = parse_into_expression(n)
    extended = self._pyexpr.extend_constant(fill, repeats)
    return self._from_pyexpr(extended)


def value_counts(
    self,
    *,
    sort: bool = False,
    parallel: bool = False,
    name: str | None = None,
    normalize: bool = False,
) -> Expr:
    """
    Count how often each unique value occurs.

    Parameters
    ----------
    sort
        Sort the output by count in descending order.
        If set to `False` (default), the order of the output is random.
    parallel
        Execute the computation in parallel.

        .. note::
            This option should likely not be enabled in a group by context,
            as the computation is already parallelized per group.
    name
        Give the resulting count column a specific name;
        if `normalize` is True defaults to "proportion",
        otherwise defaults to "count".
    normalize
        If true gives relative frequencies of the unique values

    Returns
    -------
    Expr
        Expression of data type :class:`Struct` with mapping of unique values to
        their count.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"color": ["red", "blue", "red", "green", "blue", "blue"]}
    ... )
    >>> df.select(pl.col("color").value_counts())  # doctest: +IGNORE_RESULT
    shape: (3, 1)
    ┌─────────────┐
    │ color       │
    │ ---         │
    │ struct[2]   │
    ╞═════════════╡
    │ {"red",2}   │
    │ {"green",1} │
    │ {"blue",3}  │
    └─────────────┘

    Sort the output by (descending) count and customize the count field name.

    >>> df = df.select(pl.col("color").value_counts(sort=True, name="n"))
    >>> df
    shape: (3, 1)
    ┌─────────────┐
    │ color       │
    │ ---         │
    │ struct[2]   │
    ╞═════════════╡
    │ {"blue",3}  │
    │ {"red",2}   │
    │ {"green",1} │
    └─────────────┘

    >>> df.unnest("color")
    shape: (3, 2)
    ┌───────┬─────┐
    │ color ┆ n   │
    │ ---   ┆ --- │
    │ str   ┆ u32 │
    ╞═══════╪═════╡
    │ blue  ┆ 3   │
    │ red   ┆ 2   │
    │ green ┆ 1   │
    └───────┴─────┘
    """
    # An explicit name always wins; otherwise the default depends on `normalize`.
    fallback = "proportion" if normalize else "count"
    field_name = fallback if name is None else name
    counted = self._pyexpr.value_counts(sort, parallel, field_name, normalize)
    return self._from_pyexpr(counted)
def log(self, base: float = math.e) -> Expr:
    """
    Take the logarithm of each element to the given base.

    Parameters
    ----------
    base
        Given base, defaults to `e`

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3]})
    >>> df.select(pl.col("a").log(base=2))
    shape: (3, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.0      │
    │ 1.0      │
    │ 1.584963 │
    └──────────┘
    """
    logarithm = self._pyexpr.log(base)
    return self._from_pyexpr(logarithm)


def log1p(self) -> Expr:
    """
    Take the natural logarithm of one plus each element.

    This computes `log(1 + x)` but is more numerically stable for `x` close to zero.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3]})
    >>> df.select(pl.col("a").log1p())
    shape: (3, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 0.693147 │
    │ 1.098612 │
    │ 1.386294 │
    └──────────┘
    """
    logarithm = self._pyexpr.log1p()
    return self._from_pyexpr(logarithm)


def entropy(self, base: float = math.e, *, normalize: bool = True) -> Expr:
    """
    Compute the entropy of the values.

    Uses the formula `-sum(pk * log(pk))` where `pk` are discrete probabilities.

    Parameters
    ----------
    base
        Given base, defaults to `e`
    normalize
        Normalize pk if it doesn't sum to 1.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3]})
    >>> df.select(pl.col("a").entropy(base=2))
    shape: (1, 1)
    ┌──────────┐
    │ a        │
    │ ---      │
    │ f64      │
    ╞══════════╡
    │ 1.459148 │
    └──────────┘
    >>> df.select(pl.col("a").entropy(base=2, normalize=False))
    shape: (1, 1)
    ┌───────────┐
    │ a         │
    │ ---       │
    │ f64       │
    ╞═══════════╡
    │ -6.754888 │
    └───────────┘
    """
    measured = self._pyexpr.entropy(base, normalize)
    return self._from_pyexpr(measured)
def set_sorted(self, *, descending: bool = False) -> Expr:
    """
    Mark the expression as 'sorted'.

    Enables downstream code to use fast paths for sorted arrays.

    Parameters
    ----------
    descending
        Whether the `Series` order is descending.

    Warnings
    --------
    This can lead to incorrect results if the data is NOT sorted!!
    Use with care!

    Examples
    --------
    >>> df = pl.DataFrame({"values": [1, 2, 3]})
    >>> df.select(pl.col("values").set_sorted().max())
    shape: (1, 1)
    ┌────────┐
    │ values │
    │ ---    │
    │ i64    │
    ╞════════╡
    │ 3      │
    └────────┘
    """
    flagged = self._pyexpr.set_sorted_flag(descending)
    return self._from_pyexpr(flagged)
@unstable()
def hist(
    self,
    bins: IntoExpr | None = None,
    *,
    bin_count: int | None = None,
    include_category: bool = False,
    include_breakpoint: bool = False,
) -> Expr:
    """
    Group values into buckets and count how many fall into each.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Parameters
    ----------
    bins
        Discretizations to make.
        If None given, we determine the boundaries based on the data.
    bin_count
        If no bins provided, this will be used to determine
        the distance of the bins
    include_category
        Include a column that shows the intervals as categories.
    include_breakpoint
        Include a column that indicates the upper breakpoint.

    Returns
    -------
    Expr

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]})
    >>> df.select(pl.col("a").hist(bins=[1, 2, 3]))
    shape: (4, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    │ 1   │
    │ 2   │
    │ 2   │
    └─────┘
    >>> df.select(
    ...     pl.col("a").hist(
    ...         bins=[1, 2, 3], include_breakpoint=True, include_category=True
    ...     )
    ... )
    shape: (4, 1)
    ┌───────────────────────┐
    │ a                     │
    │ ---                   │
    │ struct[3]             │
    ╞═══════════════════════╡
    │ {1.0,"(-inf, 1.0]",2} │
    │ {2.0,"(1.0, 2.0]",1}  │
    │ {3.0,"(2.0, 3.0]",2}  │
    │ {inf,"(3.0, inf]",2}  │
    └───────────────────────┘
    """
    py_bins = None
    if bins is not None:
        # A plain list is wrapped in a Series before being parsed.
        candidate = pl.Series(bins) if isinstance(bins, list) else bins
        py_bins = parse_into_expression(candidate)
    binned = self._pyexpr.hist(
        py_bins, bin_count, include_category, include_breakpoint
    )
    return self._from_pyexpr(binned)
Examples -------- Replace a single value by another value. Values that were not replaced remain unchanged. >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) shape: (4, 2) ┌─────┬──────────┐ │ a ┆ replaced │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪══════════╡ │ 1 ┆ 1 │ │ 2 ┆ 100 │ │ 2 ┆ 100 │ │ 3 ┆ 3 │ └─────┴──────────┘ Replace multiple values by passing sequences to the `old` and `new` parameters. >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) shape: (4, 2) ┌─────┬──────────┐ │ a ┆ replaced │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪══════════╡ │ 1 ┆ 1 │ │ 2 ┆ 100 │ │ 2 ┆ 100 │ │ 3 ┆ 200 │ └─────┴──────────┘ Passing a mapping with replacements is also supported as syntactic sugar. >>> mapping = {2: 100, 3: 200} >>> df.with_columns(replaced=pl.col("a").replace(mapping)) shape: (4, 2) ┌─────┬──────────┐ │ a ┆ replaced │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪══════════╡ │ 1 ┆ 1 │ │ 2 ┆ 100 │ │ 2 ┆ 100 │ │ 3 ┆ 200 │ └─────┴──────────┘ The original data type is preserved when replacing by values of a different data type. Use :meth:`replace_strict` to replace and change the return data type. >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) >>> mapping = {"x": 1, "y": 2, "z": 3} >>> df.with_columns(replaced=pl.col("a").replace(mapping)) shape: (3, 2) ┌─────┬──────────┐ │ a ┆ replaced │ │ --- ┆ --- │ │ str ┆ str │ ╞═════╪══════════╡ │ x ┆ 1 │ │ y ┆ 2 │ │ z ┆ 3 │ └─────┴──────────┘ Expression input is supported. >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) >>> df.with_columns( ... replaced=pl.col("a").replace( ... old=pl.col("a").max(), ... new=pl.col("b").sum(), ... ) ... 
)
        shape: (4, 3)
        ┌─────┬─────┬──────────┐
        │ a   ┆ b   ┆ replaced │
        │ --- ┆ --- ┆ ---      │
        │ i64 ┆ f64 ┆ i64      │
        ╞═════╪═════╪══════════╡
        │ 1   ┆ 1.5 ┆ 1        │
        │ 2   ┆ 2.5 ┆ 2        │
        │ 2   ┆ 5.0 ┆ 2        │
        │ 3   ┆ 1.0 ┆ 10       │
        └─────┴─────┴──────────┘
        """
        if return_dtype is not None:
            issue_deprecation_warning(
                "The `return_dtype` parameter for `replace` is deprecated."
                " Use `replace_strict` instead to set a return data type while replacing values.",
                version="1.0.0",
            )
        if default is not no_default:
            issue_deprecation_warning(
                "The `default` parameter for `replace` is deprecated."
                " Use `replace_strict` instead to set a default while replacing values.",
                version="1.0.0",
            )
            # A default was supplied: hand the whole call over to `replace_strict`.
            return self.replace_strict(
                old, new, default=default, return_dtype=return_dtype
            )

        if new is no_default:
            # Without `new`, `old` must be a mapping of value -> replacement.
            if not isinstance(old, Mapping):
                msg = (
                    "`new` argument is required if `old` argument is not a Mapping type"
                )
                raise TypeError(msg)
            new = pl.Series(old.values())
            old = pl.Series(old.keys())
        else:
            # Non-string sequences are converted to Series before expression parsing.
            if isinstance(old, Sequence) and not isinstance(old, (str, pl.Series)):
                old = pl.Series(old)
            if isinstance(new, Sequence) and not isinstance(new, (str, pl.Series)):
                new = pl.Series(new)

        old = parse_into_expression(old, str_as_lit=True)  # type: ignore[arg-type]
        new = parse_into_expression(new, str_as_lit=True)

        result = self._from_pyexpr(self._pyexpr.replace(old, new))

        # Deprecated `return_dtype` is still honoured by casting the result.
        if return_dtype is not None:
            result = result.cast(return_dtype)

        return result

    def replace_strict(
        self,
        old: IntoExpr | Sequence[Any] | Mapping[Any, Any],
        new: IntoExpr | Sequence[Any] | NoDefault = no_default,
        *,
        default: IntoExpr | NoDefault = no_default,
        return_dtype: PolarsDataType | None = None,
    ) -> Expr:
        """
        Replace all values by different values.

        Parameters
        ----------
        old
            Value or sequence of values to replace.
            Accepts expression input. Sequences are parsed as Series,
            other non-expression inputs are parsed as literals.
            Also accepts a mapping of values to their replacement as syntactic sugar for
            `replace_strict(old=pl.Series(mapping.keys()), new=pl.Series(mapping.values()))`.
        new
            Value or sequence of values to replace by.
            Accepts expression input. Sequences are parsed as Series,
            other non-expression inputs are parsed as literals.
            Length must match the length of `old` or have length 1.
            Required unless `old` is a mapping.
        default
            Set values that were not replaced to this value.
            If no default is specified (default), an error is raised if any values
            were not replaced.
            Accepts expression input. Non-expression inputs are parsed as literals.
        return_dtype
            The data type of the resulting expression. If set to `None` (default),
            the data type is determined automatically based on the other inputs.

        Raises
        ------
        TypeError
            If `new` is not given and `old` is not a mapping.
        InvalidOperationError
            If any non-null values in the original column were not replaced, and no
            `default` was specified.

        See Also
        --------
        replace
        str.replace

        Notes
        -----
        The global string cache must be enabled when replacing categorical values.

        Examples
        --------
        Replace values by passing sequences to the `old` and `new` parameters.

        >>> df = pl.DataFrame({"a": [1, 2, 2, 3]})
        >>> df.with_columns(
        ...     replaced=pl.col("a").replace_strict([1, 2, 3], [100, 200, 300])
        ... )
        shape: (4, 2)
        ┌─────┬──────────┐
        │ a   ┆ replaced │
        │ --- ┆ ---      │
        │ i64 ┆ i64      │
        ╞═════╪══════════╡
        │ 1   ┆ 100      │
        │ 2   ┆ 200      │
        │ 2   ┆ 200      │
        │ 3   ┆ 300      │
        └─────┴──────────┘

        Passing a mapping with replacements is also supported as syntactic sugar.

        >>> mapping = {1: 100, 2: 200, 3: 300}
        >>> df.with_columns(replaced=pl.col("a").replace_strict(mapping))
        shape: (4, 2)
        ┌─────┬──────────┐
        │ a   ┆ replaced │
        │ --- ┆ ---      │
        │ i64 ┆ i64      │
        ╞═════╪══════════╡
        │ 1   ┆ 100      │
        │ 2   ┆ 200      │
        │ 2   ┆ 200      │
        │ 3   ┆ 300      │
        └─────┴──────────┘

        By default, an error is raised if any non-null values were not replaced.
        Specify a default to set all values that were not matched.

        >>> mapping = {2: 200, 3: 300}
        >>> df.with_columns(
        ...     replaced=pl.col("a").replace_strict(mapping)
        ... )  # doctest: +SKIP
        Traceback (most recent call last):
        ...
        polars.exceptions.InvalidOperationError: incomplete mapping specified for `replace_strict`
        >>> df.with_columns(replaced=pl.col("a").replace_strict(mapping, default=-1))
        shape: (4, 2)
        ┌─────┬──────────┐
        │ a   ┆ replaced │
        │ --- ┆ ---      │
        │ i64 ┆ i64      │
        ╞═════╪══════════╡
        │ 1   ┆ -1       │
        │ 2   ┆ 200      │
        │ 2   ┆ 200      │
        │ 3   ┆ 300      │
        └─────┴──────────┘

        Replacing by values of a different data type sets the return type based on
        a combination of the `new` data type and the `default` data type.

        >>> df = pl.DataFrame({"a": ["x", "y", "z"]})
        >>> mapping = {"x": 1, "y": 2, "z": 3}
        >>> df.with_columns(replaced=pl.col("a").replace_strict(mapping))
        shape: (3, 2)
        ┌─────┬──────────┐
        │ a   ┆ replaced │
        │ --- ┆ ---      │
        │ str ┆ i64      │
        ╞═════╪══════════╡
        │ x   ┆ 1        │
        │ y   ┆ 2        │
        │ z   ┆ 3        │
        └─────┴──────────┘
        >>> df.with_columns(replaced=pl.col("a").replace_strict(mapping, default="x"))
        shape: (3, 2)
        ┌─────┬──────────┐
        │ a   ┆ replaced │
        │ --- ┆ ---      │
        │ str ┆ str      │
        ╞═════╪══════════╡
        │ x   ┆ 1        │
        │ y   ┆ 2        │
        │ z   ┆ 3        │
        └─────┴──────────┘

        Set the `return_dtype` parameter to control the resulting data type directly.

        >>> df.with_columns(
        ...     replaced=pl.col("a").replace_strict(mapping, return_dtype=pl.UInt8)
        ... )
        shape: (3, 2)
        ┌─────┬──────────┐
        │ a   ┆ replaced │
        │ --- ┆ ---      │
        │ str ┆ u8       │
        ╞═════╪══════════╡
        │ x   ┆ 1        │
        │ y   ┆ 2        │
        │ z   ┆ 3        │
        └─────┴──────────┘

        Expression input is supported for all parameters.

        >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]})
        >>> df.with_columns(
        ...     replaced=pl.col("a").replace_strict(
        ...         old=pl.col("a").max(),
        ...         new=pl.col("b").sum(),
        ...         default=pl.col("b"),
        ...     )
        ... )
        shape: (4, 3)
        ┌─────┬─────┬──────────┐
        │ a   ┆ b   ┆ replaced │
        │ --- ┆ --- ┆ ---      │
        │ i64 ┆ f64 ┆ f64      │
        ╞═════╪═════╪══════════╡
        │ 1   ┆ 1.5 ┆ 1.5      │
        │ 2   ┆ 2.5 ┆ 2.5      │
        │ 2   ┆ 5.0 ┆ 5.0      │
        │ 3   ┆ 1.0 ┆ 10.0     │
        └─────┴─────┴──────────┘
        """  # noqa: W505
        if new is no_default:
            # Without `new`, `old` must be a mapping of value -> replacement.
            if not isinstance(old, Mapping):
                msg = (
                    "`new` argument is required if `old` argument is not a Mapping type"
                )
                raise TypeError(msg)
            new = pl.Series(old.values())
            old = pl.Series(old.keys())

        old = parse_into_expression(old, str_as_lit=True, list_as_series=True)  # type: ignore[arg-type]
        new = parse_into_expression(new, str_as_lit=True, list_as_series=True)  # type: ignore[arg-type]

        # `None` is passed through when the caller did not supply a default.
        default = (
            None
            if default is no_default
            else parse_into_expression(default, str_as_lit=True)
        )

        return self._from_pyexpr(
            self._pyexpr.replace_strict(old, new, default, return_dtype)
        )

    def bitwise_count_ones(self) -> Expr:
        """Evaluate the number of set bits."""
        return self._from_pyexpr(self._pyexpr.bitwise_count_ones())

    def bitwise_count_zeros(self) -> Expr:
        """Evaluate the number of unset bits."""
        return self._from_pyexpr(self._pyexpr.bitwise_count_zeros())

    def bitwise_leading_ones(self) -> Expr:
        """Evaluate the number of most-significant set bits before seeing an unset bit."""
        return self._from_pyexpr(self._pyexpr.bitwise_leading_ones())

    def bitwise_leading_zeros(self) -> Expr:
        """Evaluate the number of most-significant unset bits before seeing a set bit."""
        return self._from_pyexpr(self._pyexpr.bitwise_leading_zeros())

    def bitwise_trailing_ones(self) -> Expr:
        """Evaluate the number of least-significant set bits before seeing an unset bit."""
        return self._from_pyexpr(self._pyexpr.bitwise_trailing_ones())

    def bitwise_trailing_zeros(self) -> Expr:
        """Evaluate the number of least-significant unset bits before seeing a set bit."""
        return self._from_pyexpr(self._pyexpr.bitwise_trailing_zeros())

    def bitwise_and(self) -> Expr:
        """Perform an aggregation of bitwise ANDs."""
        return self._from_pyexpr(self._pyexpr.bitwise_and())

    def bitwise_or(self) -> Expr:
        """Perform an aggregation of bitwise ORs."""
        return self._from_pyexpr(self._pyexpr.bitwise_or())

    def bitwise_xor(self) -> Expr:
        """Perform an aggregation of bitwise XORs."""
        return self._from_pyexpr(self._pyexpr.bitwise_xor())

    @deprecate_function(
        "Use `polars.plugins.register_plugin_function` instead.", version="0.20.16"
    )
    def register_plugin(
        self,
        *,
        lib: str,
        symbol: str,
        args: list[IntoExpr] | None = None,
        kwargs: dict[Any, Any] | None = None,
        is_elementwise: bool = False,
        input_wildcard_expansion: bool = False,
        returns_scalar: bool = False,
        cast_to_supertypes: bool = False,
        pass_name_to_apply: bool = False,
        changes_length: bool = False,
    ) -> Expr:
        """
        Register a plugin function.

        .. deprecated:: 0.20.16
            Use :func:`polars.plugins.register_plugin_function` instead.

        See the `user guide <https://docs.pola.rs/user-guide/expressions/plugins/>`_
        for more information about plugins.

        Warnings
        --------
        This method is deprecated. Use the new `polars.plugins.register_plugin_function`
        function instead.

        This is highly unsafe as this will call the C function loaded by
        `lib::symbol`.

        The parameters you set dictate how Polars will handle the function.
        Make sure they are correct!

        Parameters
        ----------
        lib
            Library to load.
        symbol
            Function to load.
        args
            Arguments (other than self) passed to this function.
            These arguments have to be of type Expression.
        kwargs
            Non-expression arguments. They must be JSON serializable.
        is_elementwise
            If the function only operates on scalars
            this will trigger fast paths.
        input_wildcard_expansion
            Expand expressions as input of this function.
        returns_scalar
            Automatically explode on unit length if it ran as final aggregation.
            This is the case for aggregations like `sum`, `min`, `covariance` etc.
        cast_to_supertypes
            Cast the input datatypes to their supertype.
        pass_name_to_apply
            If set, then the `Series` passed to the function in the group_by operation
            will ensure the name is set. This is an extra heap allocation per group.
        changes_length
            Set if the function changes the length of its input;
            for example a `unique` or a `slice`.
        """
        from polars.plugins import register_plugin_function

        # `self` is always the first input handed to the plugin function.
        if args is None:
            args = [self]
        else:
            args = [self, *list(args)]

        return register_plugin_function(
            plugin_path=lib,
            function_name=symbol,
            args=args,
            kwargs=kwargs,
            is_elementwise=is_elementwise,
            changes_length=changes_length,
            returns_scalar=returns_scalar,
            cast_to_supertype=cast_to_supertypes,
            input_wildcard_expansion=input_wildcard_expansion,
            pass_name_to_apply=pass_name_to_apply,
        )

    @classmethod
    def from_json(cls, value: str) -> Expr:
        """
        Read an expression from a JSON encoded string to construct an Expression.

        .. deprecated:: 0.20.11
            This method has been renamed to :meth:`deserialize`.
            Note that the new method operates on file-like inputs rather than strings.
            Enclose your input in `io.StringIO` to keep the same behavior.

        Parameters
        ----------
        value
            JSON encoded string value
        """
        issue_deprecation_warning(
            "`Expr.from_json` is deprecated. It has been renamed to `Expr.deserialize`."
            " Note that the new method operates on file-like inputs rather than strings."
            " Enclose your input in `io.StringIO` to keep the same behavior.",
            version="0.20.11",
        )
        # Wrap the string in a file-like object, as `deserialize` expects one.
        return cls.deserialize(StringIO(value), format="json")

    @property
    def bin(self) -> ExprBinaryNameSpace:
        """
        Create an object namespace of all binary related methods.

        See the individual method pages for full details.
        """
        return ExprBinaryNameSpace(self)

    @property
    def cat(self) -> ExprCatNameSpace:
        """
        Create an object namespace of all categorical related methods.

        See the individual method pages for full details.

        Examples
        --------
        >>> df = pl.DataFrame({"values": ["a", "b"]}).select(
        ...     pl.col("values").cast(pl.Categorical)
        ... )
        >>> df.select(pl.col("values").cat.get_categories())
        shape: (2, 1)
        ┌────────┐
        │ values │
        │ ---    │
        │ str    │
        ╞════════╡
        │ a      │
        │ b      │
        └────────┘
        """
        return ExprCatNameSpace(self)

    @property
    def dt(self) -> ExprDateTimeNameSpace:
        """Create an object namespace of all datetime related methods."""
        return ExprDateTimeNameSpace(self)

    # Keep the `list` and `str` properties below at the end of the definition of Expr,
    # as to not confuse mypy with the type annotation `str` and `list`

    @property
    def list(self) -> ExprListNameSpace:
        """
        Create an object namespace of all list related methods.

        See the individual method pages for full details.
        """
        return ExprListNameSpace(self)

    @property
    def arr(self) -> ExprArrayNameSpace:
        """
        Create an object namespace of all array related methods.

        See the individual method pages for full details.
        """
        return ExprArrayNameSpace(self)

    @property
    def meta(self) -> ExprMetaNameSpace:
        """
        Create an object namespace of all meta related expression methods.

        This can be used to modify and traverse existing expressions.
        """
        return ExprMetaNameSpace(self)

    @property
    def name(self) -> ExprNameNameSpace:
        """
        Create an object namespace of all expressions that modify expression names.

        See the individual method pages for full details.
        """
        return ExprNameNameSpace(self)

    @property
    def str(self) -> ExprStringNameSpace:
        """
        Create an object namespace of all string related methods.

        See the individual method pages for full details.

        Examples
        --------
        >>> df = pl.DataFrame({"letters": ["a", "b"]})
        >>> df.select(pl.col("letters").str.to_uppercase())
        shape: (2, 1)
        ┌─────────┐
        │ letters │
        │ ---     │
        │ str     │
        ╞═════════╡
        │ A       │
        │ B       │
        └─────────┘
        """
        return ExprStringNameSpace(self)

    @property
    def struct(self) -> ExprStructNameSpace:
        """
        Create an object namespace of all struct related methods.

        See the individual method pages for full details.

        Examples
        --------
        >>> df = (
        ...     pl.DataFrame(
        ...         {
        ...             "int": [1, 2],
        ...             "str": ["a", "b"],
        ...             "bool": [True, None],
        ...             "list": [[1, 2], [3]],
        ...         }
        ...     )
        ...     .to_struct("my_struct")
        ...     .to_frame()
        ... )
        >>> df.select(pl.col("my_struct").struct.field("str"))
        shape: (2, 1)
        ┌─────┐
        │ str │
        │ --- │
        │ str │
        ╞═════╡
        │ a   │
        │ b   │
        └─────┘
        """
        return ExprStructNameSpace(self)
def _prepare_alpha( com: float | int | None = None, span: float | int | None = None, half_life: float | int | None = None, alpha: float | int | None = None, ) -> float: """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" if sum((param is not None) for param in (com, span, half_life, alpha)) > 1: msg = ( "parameters `com`, `span`, `half_life`, and `alpha` are mutually exclusive" ) raise ValueError(msg) if com is not None: if com < 0.0: msg = f"require `com` >= 0 (found {com!r})" raise ValueError(msg) alpha = 1.0 / (1.0 + com) elif span is not None: if span < 1.0: msg = f"require `span` >= 1 (found {span!r})" raise ValueError(msg) alpha = 2.0 / (span + 1.0) elif half_life is not None: if half_life <= 0.0: msg = f"require `half_life` > 0 (found {half_life!r})" raise ValueError(msg) alpha = 1.0 - math.exp(-math.log(2.0) / half_life) elif alpha is None: msg = "one of `com`, `span`, `half_life`, or `alpha` must be set" raise ValueError(msg) elif not (0 < alpha <= 1): msg = f"require 0 < `alpha` <= 1 (found {alpha!r})" raise ValueError(msg) return alpha def _prepare_rolling_by_window_args(window_size: timedelta | str) -> str: if isinstance(window_size, timedelta): window_size = parse_as_duration_string(window_size) return window_size