from __future__ import annotations
import contextlib
import math
import os
import typing
from datetime import date, datetime, time, timedelta
from typing import (
TYPE_CHECKING,
Any,
Callable,
Collection,
Generator,
Iterable,
NoReturn,
Sequence,
Union,
overload,
)
import polars._reexport as pl
from polars import functions as F
from polars.datatypes import (
FLOAT_DTYPES,
INTEGER_DTYPES,
NUMERIC_DTYPES,
SIGNED_INTEGER_DTYPES,
TEMPORAL_DTYPES,
UNSIGNED_INTEGER_DTYPES,
Boolean,
Categorical,
Date,
Datetime,
Decimal,
Duration,
Float32,
Float64,
Int8,
Int16,
Int32,
Int64,
List,
Object,
Time,
UInt8,
UInt16,
UInt32,
UInt64,
Unknown,
Utf8,
dtype_to_ctype,
is_polars_dtype,
maybe_cast,
numpy_char_code_to_dtype,
py_type_to_dtype,
supported_numpy_char_code,
)
from polars.dependencies import (
_PYARROW_AVAILABLE,
_check_for_numpy,
_check_for_pandas,
_check_for_pyarrow,
)
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import ShapeError
from polars.series.binary import BinaryNameSpace
from polars.series.categorical import CatNameSpace
from polars.series.datetime import DateTimeNameSpace
from polars.series.list import ListNameSpace
from polars.series.string import StringNameSpace
from polars.series.struct import StructNameSpace
from polars.series.utils import expr_dispatch, get_ffi_func
from polars.slice import PolarsSlice
from polars.utils._construction import (
arrow_to_pyseries,
iterable_to_pyseries,
numpy_to_pyseries,
pandas_to_pyseries,
sequence_to_pyseries,
series_to_pyseries,
)
from polars.utils._wrap import wrap_df
from polars.utils.convert import (
_date_to_pl_date,
_datetime_to_pl_timestamp,
_time_to_pl_time,
)
from polars.utils.decorators import deprecated_alias
from polars.utils.meta import get_index_type
from polars.utils.various import (
_is_generator,
is_int_sequence,
parse_version,
range_to_series,
range_to_slice,
scale_bytes,
sphinx_accessor,
)
with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PyDataFrame, PySeries
if TYPE_CHECKING:
import sys
from polars.dataframe import DataFrame
from polars.expr.expr import Expr
from polars.series._numpy import SeriesView
from polars.type_aliases import (
ClosedInterval,
ComparisonOperator,
FillNullStrategy,
InterpolationMethod,
IntoExpr,
NullBehavior,
OneOrMoreDataTypes,
PolarsDataType,
PythonLiteral,
RankMethod,
RollingInterpolationMethod,
SearchSortedSide,
SizeUnit,
TimeUnit,
)
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
elif os.getenv("BUILDING_SPHINX_DOCS"):
property = sphinx_accessor
# Union of the value containers accepted as Series input. The library types are
# written as string forward references, so pyarrow / numpy / pandas names are
# not evaluated when this alias is built.
ArrayLike = Union[
    Sequence[Any],
    "Series",
    "pa.Array",
    "pa.ChunkedArray",
    "np.ndarray",
    "pd.Series",
    "pd.DatetimeIndex",
]
[docs]@expr_dispatch
class Series:
"""
A Series represents a single column in a polars DataFrame.
Parameters
----------
name : str, default None
Name of the series. Will be used as a column name when used in a DataFrame.
When not specified, name is set to an empty string.
values : ArrayLike, default None
One-dimensional data in various forms. Supported are: Sequence, Series,
pyarrow Array, and numpy ndarray.
dtype : DataType, default None
Polars dtype of the Series data. If not specified, the dtype is inferred.
strict
Throw error on numeric overflow.
nan_to_null
In case a numpy array is used to create this Series, indicate how to deal
with np.nan values. (This parameter is a no-op on non-numpy data).
dtype_if_empty : DataType, default None
If no dtype is specified and values contains None, an empty list, or a
list with only None values, set the Polars dtype of the Series data.
If not specified, Float32 is used in those cases.
Examples
--------
Constructing a Series by specifying name and values positionally:
>>> s = pl.Series("a", [1, 2, 3])
>>> s
shape: (3,)
Series: 'a' [i64]
[
1
2
3
]
Notice that the dtype is automatically inferred as a polars Int64:
>>> s.dtype
Int64
Constructing a Series with a specific dtype:
>>> s2 = pl.Series("a", [1, 2, 3], dtype=pl.Float32)
>>> s2
shape: (3,)
Series: 'a' [f32]
[
1.0
2.0
3.0
]
It is possible to construct a Series with values as the first positional argument.
This syntax is considered an anti-pattern, but it can be useful in certain
scenarios. You must specify any other arguments through keywords.
>>> s3 = pl.Series([1, 2, 3])
>>> s3
shape: (3,)
Series: '' [i64]
[
1
2
3
]
"""
# Underlying PySeries object; set by __init__, _from_pyseries and __setstate__.
_s: PySeries = None
# Names of the namespace accessors; how this set is consumed is not visible here.
_accessors: set[str] = {"arr", "cat", "dt", "str", "bin", "struct"}
def __init__(
    self,
    name: str | ArrayLike | None = None,
    values: ArrayLike | None = None,
    dtype: PolarsDataType | None = None,
    *,
    strict: bool = True,
    nan_to_null: bool = False,
    dtype_if_empty: PolarsDataType | None = None,
):
    """
    Build the underlying PySeries from ``values``, dispatching on its type.

    Raises
    ------
    ValueError
        If ``dtype`` is not a valid Polars data type, if ``values`` is given
        together with a non-string ``name``, or if ``values`` has an
        unsupported type.
    """
    # If 'Unknown' treat as None to attempt inference
    if dtype == Unknown:
        dtype = None

    # Raise early error on invalid dtype
    if (
        dtype is not None
        and not is_polars_dtype(dtype)
        and py_type_to_dtype(dtype, raise_unmatched=False) is None
    ):
        raise ValueError(
            f"Given dtype: '{dtype}' is not a valid Polars data type and cannot be converted into one."
        )

    # Handle case where values are passed as the first argument
    if name is None:
        name = ""
    elif not isinstance(name, str):
        if values is None:
            # Series(values): shift the first positional argument into `values`
            values = name
            name = ""
        else:
            raise ValueError("Series name must be a string.")

    if values is None:
        # no data at all: build an empty series
        self._s = sequence_to_pyseries(
            name, [], dtype=dtype, dtype_if_empty=dtype_if_empty
        )
    elif isinstance(values, Series):
        self._s = series_to_pyseries(name, values)
    # range is checked before the generic Sequence branch (a range is a Sequence)
    elif isinstance(values, range):
        self._s = range_to_series(name, values, dtype=dtype)._s
    elif isinstance(values, Sequence):
        self._s = sequence_to_pyseries(
            name,
            values,
            dtype=dtype,
            strict=strict,
            dtype_if_empty=dtype_if_empty,
            nan_to_null=nan_to_null,
        )
    elif _check_for_numpy(values) and isinstance(values, np.ndarray):
        # `dtype` is not passed to the numpy constructor; it is applied below
        # through an explicit cast.
        self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
        if values.dtype.type == np.datetime64:
            # cast to appropriate dtype, handling NaT values
            dtype = _resolve_datetime_dtype(dtype, values.dtype)
            if dtype is not None:
                # positions that are NaT in the input are set to null after the cast
                self._s = (
                    self.cast(dtype)
                    .set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
                    ._s
                )
                return
        if dtype is not None:
            self._s = self.cast(dtype, strict=True)._s
    # NOTE(review): the arrow and pandas branches below do not forward `dtype`,
    # `strict` or `nan_to_null` — confirm this is intended.
    elif _check_for_pyarrow(values) and isinstance(
        values, (pa.Array, pa.ChunkedArray)
    ):
        self._s = arrow_to_pyseries(name, values)
    elif _check_for_pandas(values) and isinstance(
        values, (pd.Series, pd.DatetimeIndex)
    ):
        self._s = pandas_to_pyseries(name, values)
    elif _is_generator(values):
        self._s = iterable_to_pyseries(
            name,
            values,
            dtype=dtype,
            strict=strict,
            dtype_if_empty=dtype_if_empty,
        )
    else:
        raise ValueError(
            f"Series constructor called with unsupported type; got {type(values)}"
        )
@classmethod
def _from_pyseries(cls, pyseries: PySeries) -> Self:
series = cls.__new__(cls)
series._s = pyseries
return series
@classmethod
def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = True) -> Self:
    """Construct a Series from an Arrow Array."""
    pyseries = arrow_to_pyseries(name, values, rechunk)
    return cls._from_pyseries(pyseries)
@classmethod
def _from_pandas(
    cls,
    name: str,
    values: pd.Series | pd.DatetimeIndex,
    *,
    nan_to_null: bool = True,
) -> Self:
    """Construct a Series from a pandas Series or DatetimeIndex."""
    pyseries = pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
    return cls._from_pyseries(pyseries)
@classmethod
def _repeat(
    cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType
) -> Self:
    """Construct a Series from ``PySeries.repeat(name, val, n, dtype)``."""
    repeated = PySeries.repeat(name, val, n, dtype)
    return cls._from_pyseries(repeated)
def _get_ptr(self) -> int:
"""
Get a pointer to the start of the values buffer of a numeric Series.
This will raise an error if the
``Series`` contains multiple chunks
"""
return self._s.get_ptr()
@property
def dtype(self) -> PolarsDataType:
"""
Get the data type of this Series.
Examples
--------
>>> s = pl.Series("a", [1, 2, 3])
>>> s.dtype
Int64
"""
return self._s.dtype()
@property
def flags(self) -> dict[str, bool]:
    """
    Get flags that are set on the Series.

    Returns
    -------
    Dictionary containing the flag name and the value

    """
    pyseries = self._s
    flag_map = {
        "SORTED_ASC": pyseries.is_sorted_ascending_flag(),
        "SORTED_DESC": pyseries.is_sorted_descending_flag(),
    }
    # The explode flag is only reported for List-typed series.
    if self.dtype == List:
        flag_map["FAST_EXPLODE"] = pyseries.can_fast_explode_flag()
    return flag_map
@property
def inner_dtype(self) -> PolarsDataType | None:
"""
Get the inner dtype in of a List typed Series.
Returns
-------
DataType
"""
return self._s.inner_dtype()
@property
def name(self) -> str:
"""Get the name of this Series."""
return self._s.name()
@property
def shape(self) -> tuple[int]:
"""Shape of this Series."""
return (self._s.len(),)
@property
def time_unit(self) -> TimeUnit | None:
"""Get the time unit of underlying Datetime Series as {"ns", "us", "ms"}."""
return self._s.time_unit()
def __bool__(self) -> NoReturn:
raise ValueError(
"The truth value of a Series is ambiguous. Hint: use '&' or '|' to chain "
"Series boolean results together, not and/or; to check if a Series "
"contains any values, use 'is_empty()'"
)
def __getstate__(self) -> Any:
return self._s.__getstate__()
def __setstate__(self, state: Any) -> None:
    """Restore from ``state`` by loading it into a fresh, empty PySeries."""
    # An empty placeholder is attached first, then overwritten with the state.
    self._s = placeholder = sequence_to_pyseries("", [], Float32)
    placeholder.__setstate__(state)
def __str__(self) -> str:
s_repr: str = self._s.as_str()
return s_repr.replace("Series", f"{self.__class__.__name__}", 1)
def __repr__(self) -> str:
return self.__str__()
def __len__(self) -> int:
return self.len()
def __and__(self, other: Series) -> Self:
if not isinstance(other, Series):
other = Series([other])
return self._from_pyseries(self._s.bitand(other._s))
def __rand__(self, other: Series) -> Series:
return self.__and__(other)
def __or__(self, other: Series) -> Self:
if not isinstance(other, Series):
other = Series([other])
return self._from_pyseries(self._s.bitor(other._s))
def __ror__(self, other: Series) -> Self:
return self.__or__(other)
def __xor__(self, other: Series) -> Self:
if not isinstance(other, Series):
other = Series([other])
return self._from_pyseries(self._s.bitxor(other._s))
def __rxor__(self, other: Series) -> Series:
return self.__xor__(other)
def _comp(self, other: Any, op: ComparisonOperator) -> Self:
    """
    Compare this Series with ``other`` using the comparison named by ``op``.

    Parameters
    ----------
    other
        Scalar, python temporal value, sequence or Series to compare against.
    op
        Operator name; used both as a PySeries method name (Series operand)
        and as the prefix of the FFI function name (scalar operand).

    Returns
    -------
    Series holding the comparison result, or ``NotImplemented`` when no FFI
    function is found for this dtype.
    """
    # special edge-case; boolean broadcast series (eq/neq) is its own result
    if self.dtype == Boolean and isinstance(other, bool) and op in ("eq", "neq"):
        if (other is True and op == "eq") or (other is False and op == "neq"):
            return self.clone()
        elif (other is False and op == "eq") or (other is True and op == "neq"):
            return ~self

    # Python temporal scalars are converted to their integer form and compared
    # through the integer FFI function. `datetime` is tested before `date`
    # because datetime is a subclass of date.
    if isinstance(other, datetime) and self.dtype == Datetime:
        ts = _datetime_to_pl_timestamp(other, self.time_unit)
        f = get_ffi_func(op + "_<>", Int64, self._s)
        assert f is not None
        return self._from_pyseries(f(ts))
    elif isinstance(other, time) and self.dtype == Time:
        d = _time_to_pl_time(other)
        f = get_ffi_func(op + "_<>", Int64, self._s)
        assert f is not None
        return self._from_pyseries(f(d))
    elif isinstance(other, date) and self.dtype == Date:
        d = _date_to_pl_date(other)
        f = get_ffi_func(op + "_<>", Int32, self._s)
        assert f is not None
        return self._from_pyseries(f(d))
    elif self.dtype == Categorical and not isinstance(other, Series):
        # a Categorical series compares any non-Series operand as a 1-element Series
        other = Series([other])

    # non-string sequences are turned into a Series before comparing
    if isinstance(other, Sequence) and not isinstance(other, str):
        other = Series("", other, dtype_if_empty=self.dtype)
    if isinstance(other, Series):
        return self._from_pyseries(getattr(self._s, op)(other._s))

    # scalar path: None is passed through uncast
    if other is not None:
        other = maybe_cast(other, self.dtype, self.time_unit)
    f = get_ffi_func(op + "_<>", self.dtype, self._s)
    if f is None:
        return NotImplemented

    return self._from_pyseries(f(other))
@overload  # type: ignore[override]
def __eq__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __eq__(self, other: Any) -> Self:
    ...

def __eq__(self, other: Any) -> Self | Expr:
    """Element-wise ``==``; an Expr operand is compared via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._comp(other, "eq")
    return F.lit(self).__eq__(other)
@overload  # type: ignore[override]
def __ne__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __ne__(self, other: Any) -> Self:
    ...

def __ne__(self, other: Any) -> Self | Expr:
    """Element-wise ``!=``; an Expr operand is compared via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._comp(other, "neq")
    return F.lit(self).__ne__(other)
@overload
def __gt__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __gt__(self, other: Any) -> Self:
    ...

def __gt__(self, other: Any) -> Self | Expr:
    """Element-wise ``>``; an Expr operand is compared via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._comp(other, "gt")
    return F.lit(self).__gt__(other)
@overload
def __lt__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __lt__(self, other: Any) -> Self:
    ...

def __lt__(self, other: Any) -> Self | Expr:
    """Element-wise ``<``; an Expr operand is compared via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._comp(other, "lt")
    return F.lit(self).__lt__(other)
@overload
def __ge__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __ge__(self, other: Any) -> Self:
    ...

def __ge__(self, other: Any) -> Self | Expr:
    """Element-wise ``>=``; an Expr operand is compared via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._comp(other, "gt_eq")
    return F.lit(self).__ge__(other)
@overload
def __le__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __le__(self, other: Any) -> Self:
    ...

def __le__(self, other: Any) -> Self | Expr:
    """Element-wise ``<=``; an Expr operand is compared via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._comp(other, "lt_eq")
    return F.lit(self).__le__(other)
def le(self, other: Any) -> Self | Expr:
"""Method equivalent of operator expression ``series <= other``."""
return self.__le__(other)
def lt(self, other: Any) -> Self | Expr:
"""Method equivalent of operator expression ``series < other``."""
return self.__lt__(other)
def eq(self, other: Any) -> Self | Expr:
"""Method equivalent of operator expression ``series == other``."""
return self.__eq__(other)
def ne(self, other: Any) -> Self | Expr:
"""Method equivalent of operator expression ``series != other``."""
return self.__ne__(other)
def ge(self, other: Any) -> Self | Expr:
"""Method equivalent of operator expression ``series >= other``."""
return self.__ge__(other)
def gt(self, other: Any) -> Self | Expr:
"""Method equivalent of operator expression ``series > other``."""
return self.__gt__(other)
def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self:
    """
    Apply a binary arithmetic operation between this Series and ``other``.

    Parameters
    ----------
    other
        Expr, Series, numpy array or scalar operand.
    op_s
        Name of the PySeries method used for Series-with-Series arithmetic.
    op_ffi
        Name pattern of the FFI function used for scalar operands; a pattern
        containing ``"rhs"`` marks this Series as the right-hand operand.

    Raises
    ------
    ValueError
        If no FFI function exists for this dtype and operand.
    """
    if isinstance(other, pl.Expr):
        # expand pl.lit, pl.datetime, pl.duration Exprs to compatible Series
        other = self.to_frame().select(other).to_series()

    if isinstance(other, Series):
        series_op = getattr(self._s, op_s)
        return self._from_pyseries(series_op(other._s))

    if _check_for_numpy(other) and isinstance(other, np.ndarray):
        series_op = getattr(self._s, op_s)
        return self._from_pyseries(series_op(Series(other)._s))

    scalar_needs_series = (
        isinstance(other, (float, date, datetime, timedelta, str))
        and not self.is_float()
    )
    if scalar_needs_series:
        # Wrap the scalar in a one-element series; operand order follows op_ffi.
        scalar_s = sequence_to_pyseries(self.name, [other])
        if "rhs" in op_ffi:
            left, right = scalar_s, self._s
        else:
            left, right = self._s, scalar_s
        return self._from_pyseries(getattr(left, op_s)(right))

    other = maybe_cast(other, self.dtype, self.time_unit)
    ffi_func = get_ffi_func(op_ffi, self.dtype, self._s)
    if ffi_func is None:
        raise ValueError(
            f"cannot do arithmetic with series of dtype: {self.dtype} and argument"
            f" of type: {type(other)}"
        )
    return self._from_pyseries(ffi_func(other))
@overload
def __add__(self, other: DataFrame) -> DataFrame:  # type: ignore[misc]
    ...

@overload
def __add__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __add__(self, other: Any) -> Self:
    ...

def __add__(self, other: Any) -> Self | DataFrame | Expr:
    """Add ``other``; a str is wrapped in a Series, DataFrame and Expr delegate."""
    if isinstance(other, str):
        return self._arithmetic(Series("", [other]), "add", "add_<>")
    if isinstance(other, pl.DataFrame):
        return other + self
    if isinstance(other, pl.Expr):
        return F.lit(self) + other
    return self._arithmetic(other, "add", "add_<>")
@overload
def __sub__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __sub__(self, other: Any) -> Self:
    ...

def __sub__(self, other: Any) -> Self | Expr:
    """Subtract ``other``; an Expr operand is handled via ``F.lit(self)``."""
    if not isinstance(other, pl.Expr):
        return self._arithmetic(other, "sub", "sub_<>")
    return F.lit(self) - other
@overload
def __truediv__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __truediv__(self, other: Any) -> Series:
    ...

def __truediv__(self, other: Any) -> Series | Expr:
    """
    Divide by ``other``.

    Raises
    ------
    ValueError
        If this Series has a temporal dtype.
    """
    if isinstance(other, pl.Expr):
        return F.lit(self) / other
    if self.is_temporal():
        raise ValueError("first cast to integer before dividing datelike dtypes")

    # this branch is exactly the floordiv function without rounding the floats
    divides_natively = self.is_float() or self.dtype == Decimal
    if not divides_natively:
        # every other dtype is cast to Float64 first, then divided
        return self.cast(Float64) / other
    return self._arithmetic(other, "div", "div_<>")
# python 3.7 is not happy. Remove this when we finally ditch that
@typing.no_type_check
def __floordiv__(self, other: Any) -> Series:
    """
    Floor-divide by ``other``.

    Raises
    ------
    ValueError
        If this Series has a temporal dtype.
    """
    if isinstance(other, pl.Expr):
        return F.lit(self).__floordiv__(other)
    if self.is_temporal():
        raise ValueError("first cast to integer before dividing datelike dtypes")

    # `other` cannot be an Expr at this point, so it is always wrapped as a literal.
    divisor = F.lit(other)
    return self.to_frame().select(F.col(self.name) // divisor).to_series()
def __invert__(self) -> Self:
    """Negate a Boolean series; return ``NotImplemented`` for any other dtype."""
    is_boolean = self.dtype == Boolean
    if not is_boolean:
        return NotImplemented
    return self._from_pyseries(self._s._not())
@overload
def __mul__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __mul__(self, other: DataFrame) -> DataFrame:  # type: ignore[misc]
    ...

@overload
def __mul__(self, other: Any) -> Series:
    ...

def __mul__(self, other: Any) -> Series | DataFrame | Expr:
    """
    Multiply by ``other``.

    Raises
    ------
    ValueError
        If this Series has a temporal dtype (and ``other`` is not an Expr).
    """
    if isinstance(other, pl.Expr):
        return F.lit(self) * other
    if self.is_temporal():
        raise ValueError("first cast to integer before multiplying datelike dtypes")
    if isinstance(other, pl.DataFrame):
        return other * self
    return self._arithmetic(other, "mul", "mul_<>")
@overload
def __mod__(self, other: Expr) -> Expr:  # type: ignore[misc]
    ...

@overload
def __mod__(self, other: Any) -> Series:
    ...

def __mod__(self, other: Any) -> Series | Expr:
    """
    Compute the remainder of dividing by ``other``.

    Raises
    ------
    ValueError
        If this Series has a temporal dtype (and ``other`` is not an Expr).
    """
    if isinstance(other, pl.Expr):
        return F.lit(self).__mod__(other)
    if not self.is_temporal():
        return self._arithmetic(other, "rem", "rem_<>")
    raise ValueError(
        "first cast to integer before applying modulo on datelike dtypes"
    )
def __rmod__(self, other: Any) -> Series:
if self.is_temporal():
raise ValueError(
"first cast to integer before applying modulo on datelike dtypes"
)
return self._arithmetic(other, "rem", "rem_<>_rhs")
def __radd__(self, other: Any) -> Series:
if isinstance(other, str):
return (other + self.to_frame()).to_series()
return self._arithmetic(other, "add", "add_<>_rhs")
def __rsub__(self, other: Any) -> Series:
return self._arithmetic(other, "sub", "sub_<>_rhs")
def __rtruediv__(self, other: Any) -> Series:
if self.is_temporal():
raise ValueError("first cast to integer before dividing datelike dtypes")
if self.is_float():
self.__rfloordiv__(other)
if isinstance(other, int):
other = float(other)
return self.cast(Float64).__rfloordiv__(other)
def __rfloordiv__(self, other: Any) -> Series:
if self.is_temporal():
raise ValueError("first cast to integer before dividing datelike dtypes")
return self._arithmetic(other, "div", "div_<>_rhs")
def __rmul__(self, other: Any) -> Series:
if self.is_temporal():
raise ValueError("first cast to integer before multiplying datelike dtypes")
return self._arithmetic(other, "mul", "mul_<>")
def __pow__(self, exponent: int | float | Series) -> Series:
return self.pow(exponent)
def __rpow__(self, other: Any) -> Series:
    """
    Reflected power (``other ** series``), evaluated through a frame select.

    Raises
    ------
    ValueError
        If this Series has a temporal dtype.
    """
    if self.is_temporal():
        raise ValueError(
            "first cast to integer before raising datelike dtypes to a power"
        )
    frame = self.to_frame()
    return frame.select(other ** F.col(self.name)).to_series()
def __matmul__(self, other: Any) -> float | Series | None:
    """Dot product with ``other``; sequences and numpy arrays become a Series."""
    needs_wrapping = isinstance(other, Sequence) or (
        _check_for_numpy(other) and isinstance(other, np.ndarray)
    )
    if needs_wrapping:
        other = Series(other)
    # elif isinstance(other, pl.DataFrame):
    #     return other.__rmatmul__(self)  # type: ignore[return-value]
    return self.dot(other)
def __rmatmul__(self, other: Any) -> float | Series | None:
    """Reflected dot product; sequences and numpy arrays become a Series first."""
    needs_wrapping = isinstance(other, Sequence) or (
        _check_for_numpy(other) and isinstance(other, np.ndarray)
    )
    if needs_wrapping:
        other = Series(other)
    return other.dot(self)
def __neg__(self) -> Series:
return 0 - self
def __pos__(self) -> Series:
return 0 + self
def __abs__(self) -> Series:
return self.abs()
def __copy__(self) -> Self:
return self.clone()
def __deepcopy__(self, memo: None = None) -> Self:
return self.clone()
def __iter__(self) -> Generator[Any, None, None]:
    """Yield the elements one by one."""
    if self.dtype == List:
        # TODO: either make a change and return py-native list data here, or find
        #  a faster way to return nested/List series; sequential 'get_idx' calls
        #  make this path a lot slower (~10x) than it needs to be.
        fetch = self._s.get_idx
        for position in range(self.len()):
            yield fetch(position)
        return

    # Other dtypes are converted to python lists one fixed-size slice at a time.
    chunk_len = 25_000
    for start in range(0, self.len(), chunk_len):
        chunk = self.slice(start, chunk_len)
        yield from chunk.to_list()
def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series:
    """
    Convert integer index positions to a Series of the index dtype.

    Negative positions are translated to absolute positions by adding the
    length of this Series.

    Parameters
    ----------
    idxs
        Integer-typed Series or 1D integer numpy array of positions.

    Returns
    -------
    Series of the dtype returned by ``get_index_type()``.

    Raises
    ------
    ValueError
        If a numpy array is not 1D, or (when the index type is UInt32) a 64-bit
        position falls outside the checked 32-bit bounds.
    NotImplementedError
        If ``idxs`` is neither a supported Series nor an integer numpy array.
    """
    # pl.UInt32 (polars) or pl.UInt64 (polars_u64_idx).
    idx_type = get_index_type()

    if isinstance(idxs, Series):
        if idxs.dtype == idx_type:
            return idxs
        if idxs.dtype in {
            UInt8,
            UInt16,
            UInt64 if idx_type == UInt32 else UInt32,
            Int8,
            Int16,
            Int32,
            Int64,
        }:
            if idx_type == UInt32:
                if idxs.dtype in {Int64, UInt64}:
                    if idxs.max() >= 2**32:  # type: ignore[operator]
                        raise ValueError(
                            "Index positions should be smaller than 2^32."
                        )
                if idxs.dtype == Int64:
                    if idxs.min() < -(2**32):  # type: ignore[operator]
                        raise ValueError(
                            "Index positions should be bigger than -2^32 + 1."
                        )
            if idxs.dtype in SIGNED_INTEGER_DTYPES:
                if idxs.min() < 0:  # type: ignore[operator]
                    # Widen narrow signed types before adding the series length.
                    if idx_type == UInt32:
                        if idxs.dtype in {Int8, Int16}:
                            idxs = idxs.cast(Int32)
                    else:
                        if idxs.dtype in {Int8, Int16, Int32}:
                            idxs = idxs.cast(Int64)

                    # Update negative indexes to absolute indexes.
                    return (
                        idxs.to_frame()
                        .select(
                            F.when(F.col(idxs.name) < 0)
                            .then(self.len() + F.col(idxs.name))
                            .otherwise(F.col(idxs.name))
                            .cast(idx_type)
                        )
                        .to_series(0)
                    )

            return idxs.cast(idx_type)

    elif _check_for_numpy(idxs) and isinstance(idxs, np.ndarray):
        if idxs.ndim != 1:
            raise ValueError("Only 1D numpy array is supported as index.")
        if idxs.dtype.kind in ("i", "u"):
            # Numpy array with signed or unsigned integers.
            # Empty arrays skip the min/max checks: numpy reductions raise on
            # zero-size input.
            has_values = idxs.size > 0

            if idx_type == UInt32:
                # Tuple membership compares with `==`. Set membership is
                # hash-based and does not match a dtype instance against the
                # numpy scalar types, so this bound check never fired.
                if (
                    has_values
                    and idxs.dtype in (np.int64, np.uint64)
                    and idxs.max() >= 2**32
                ):
                    raise ValueError("Index positions should be smaller than 2^32.")
                if has_values and idxs.dtype == np.int64 and idxs.min() < -(2**32):
                    raise ValueError(
                        "Index positions should be bigger than -2^32 + 1."
                    )
            if idxs.dtype.kind == "i":
                if has_values and idxs.min() < 0:
                    # Widen narrow signed types before adding the series length.
                    if idx_type == UInt32:
                        if idxs.dtype in (np.int8, np.int16):
                            idxs = idxs.astype(np.int32)
                    else:
                        if idxs.dtype in (np.int8, np.int16, np.int32):
                            idxs = idxs.astype(np.int64)

                    # Update negative indexes to absolute indexes.
                    idxs = np.where(idxs < 0, self.len() + idxs, idxs)

            # Cast signed numpy array to unsigned numpy array as all indexes
            # are positive and casting signed Polars Series to unsigned
            # Polars series is much slower.
            if isinstance(idxs, np.ndarray):
                idxs = idxs.astype(
                    np.uint32 if idx_type == UInt32 else np.uint64
                )

            return Series("", idxs, dtype=idx_type)

    raise NotImplementedError("Unsupported idxs datatype.")
@overload
def __getitem__(self, item: int) -> Any:
    ...

@overload
def __getitem__(
    self,
    item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool],
) -> Series:
    ...

def __getitem__(
    self,
    item: (
        int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]
    ),
) -> Any:
    """
    Select by position: a single element, or a new Series for multi-element keys.

    Raises
    ------
    ValueError
        If an integer numpy array key is not 1D, or the key type is unsupported.
    """
    # Integer Series key. Positions are normalised by `_pos_idxs` to
    # pl.UInt32 (polars) or pl.UInt64 (polars_u64_idx); negative positions
    # are converted to absolute ones there.
    if isinstance(item, Series) and item.dtype in INTEGER_DTYPES:
        positions = self._pos_idxs(item)
        return self._from_pyseries(self._s.take_with_series(positions._s))

    # Integer numpy array key, normalised the same way as an integer Series.
    is_int_ndarray = (
        _check_for_numpy(item)
        and isinstance(item, np.ndarray)
        and item.dtype.kind in ("i", "u")
    )
    if is_int_ndarray:
        if item.ndim != 1:
            raise ValueError("Only a 1D-Numpy array is supported as index.")
        positions = self._pos_idxs(item)
        return self._from_pyseries(self._s.take_with_series(positions._s))

    # Single integer; a negative value counts from the end.
    if isinstance(item, int):
        position = self.len() + item if item < 0 else item
        return self._s.get_idx(position)

    if isinstance(item, slice):
        return PolarsSlice(self).apply(item)

    # A range is handled as the equivalent slice.
    if isinstance(item, range):
        return self[range_to_slice(item)]

    # Sequence of integers (slow to check if sequence contains all integers).
    if is_int_sequence(item):
        positions = self._pos_idxs(Series("", item))
        return self._from_pyseries(self._s.take_with_series(positions._s))

    raise ValueError(
        f"Cannot __getitem__ on Series of dtype: '{self.dtype}' "
        f"with argument: '{item}' of type: '{type(item)}'."
    )
def __setitem__(
    self,
    key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object],
    value: Any,
) -> None:
    """
    Assign ``value`` at the positions selected by ``key``.

    Raises
    ------
    ValueError
        If ``value`` is a non-string sequence and this Series is neither
        numeric nor temporal, or if ``key`` has an unsupported type.
    """
    # do the single idx as first branch as those are likely in a tight loop
    if isinstance(key, int) and not isinstance(key, bool):
        # NOTE(review): the result is not assigned back to `self._s` here,
        # unlike the Series-key branches below; presumably `set_at_idx`
        # mutates in place — confirm.
        self.set_at_idx(key, value)
        return None
    elif isinstance(value, Sequence) and not isinstance(value, str):
        if self.is_numeric() or self.is_temporal():
            self.set_at_idx(key, value)  # type: ignore[arg-type]
            return None
        raise ValueError(
            f"cannot set Series of dtype: {self.dtype} with list/tuple as value;"
            " use a scalar value"
        )
    if isinstance(key, Series):
        # NOTE(review): a Series key whose dtype is none of Boolean / UInt64 /
        # UInt32 matches no branch, so nothing is assigned and no error is
        # raised — confirm this is intended.
        if key.dtype == Boolean:
            self._s = self.set(key, value)._s
        elif key.dtype == UInt64:
            self._s = self.set_at_idx(key.cast(UInt32), value)._s
        elif key.dtype == UInt32:
            self._s = self.set_at_idx(key, value)._s

    # TODO: implement for these types without casting to series
    elif _check_for_numpy(key) and isinstance(key, np.ndarray):
        if key.dtype == np.bool_:
            # boolean numpy mask
            self._s = self.set_at_idx(np.argwhere(key)[:, 0], value)._s
        else:
            # other arrays are converted to a u32 series, then this method
            # is re-entered with that series as the key
            s = self._from_pyseries(
                PySeries.new_u32("", np.array(key, np.uint32), True)
            )
            self.__setitem__(s, value)
    elif isinstance(key, (list, tuple)):
        # lists/tuples are converted to a UInt32 series and handled as above
        s = self._from_pyseries(sequence_to_pyseries("", key, dtype=UInt32))
        self.__setitem__(s, value)
    else:
        raise ValueError(f'cannot use "{key}" for indexing')
def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:
if dtype:
return self.to_numpy().__array__(dtype)
else:
return self.to_numpy().__array__()
def __array_ufunc__(
    self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
) -> Series:
    """
    Numpy universal functions.

    Only the ``__call__`` method of ufuncs with a single output is supported.

    Raises
    ------
    NotImplementedError
        If ``method`` is not ``"__call__"``, the ufunc has more than one
        output, or no matching ``apply_ufunc_<>`` FFI function is found.
    ValueError
        If an input is not an int, float, numpy array or Series.
    """
    # NOTE: this rechunks the underlying PySeries in place when it has more
    # than one chunk.
    if self._s.n_chunks() > 1:
        self._s.rechunk(in_place=True)

    s = self._s

    if method == "__call__":
        if not ufunc.nout == 1:
            raise NotImplementedError(
                "Only ufuncs that return one 1D array, are supported."
            )

        args: list[int | float | np.ndarray[Any, Any]] = []

        for arg in inputs:
            if isinstance(arg, (int, float, np.ndarray)):
                args.append(arg)
            elif isinstance(arg, Series):
                # Series inputs are handed to numpy through `view`
                args.append(arg.view(ignore_nulls=True))
            else:
                raise ValueError(f"Unsupported type {type(arg)} for {arg}.")

        # Get minimum dtype needed to be able to cast all input arguments to the
        # same dtype.
        dtype_char_minimum = np.result_type(*args).char

        # Get all possible output dtypes for ufunc.
        # Input dtypes and output dtypes seem to always match for ufunc.types,
        # so pick all the different output dtypes.
        dtypes_ufunc = [
            input_output_type[-1]
            for input_output_type in ufunc.types
            if supported_numpy_char_code(input_output_type[-1])
        ]

        # Get the first ufunc dtype from all possible ufunc dtypes for which
        # the input arguments can be safely cast to that ufunc dtype.
        for dtype_ufunc in dtypes_ufunc:
            if np.can_cast(dtype_char_minimum, dtype_ufunc):
                dtype_char_minimum = dtype_ufunc
                break

        # Override minimum dtype if requested.
        dtype_char = (
            np.dtype(kwargs.pop("dtype")).char
            if "dtype" in kwargs
            else dtype_char_minimum
        )

        f = get_ffi_func("apply_ufunc_<>", numpy_char_code_to_dtype(dtype_char), s)

        if f is None:
            raise NotImplementedError(
                "Could not find "
                f"`apply_ufunc_{numpy_char_code_to_dtype(dtype_char)}`."
            )

        # the FFI function supplies `out`; the ufunc writes its result into it
        series = f(lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs))
        return self._from_pyseries(series)
    else:
        raise NotImplementedError(
            "Only `__call__` is implemented for numpy ufuncs on a Series, got"
            f" `{method}`."
        )
def _repr_html_(self) -> str:
"""Format output data in HTML for display in Jupyter Notebooks."""
return self.to_frame()._repr_html_(from_series=True)
def item(self, row: int | None = None) -> Any:
"""
Return the series as a scalar, or return the element at the given row index.
If no row index is provided, this is equivalent to ``s[0]``, with a check
that the shape is (1,). With a row index, this is equivalent to ``s[row]``.
Examples
--------
>>> s1 = pl.Series("a", [1])
>>> s1.item()
1
>>> s2 = pl.Series("a", [9, 8, 7])
>>> s2.cumsum().item(-1)
24
"""
if row is None and len(self) != 1:
raise ValueError(
f"Can only call '.item()' if the series is of length 1, or an "
f"explicit row index is provided (series is of length {len(self)})"
)
return self[row or 0]
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
    """
    Return an estimation of the total (heap) allocated size of the Series.

    Estimated size is given in the specified unit (bytes by default).

    This estimation is the sum of the size of its buffers, validity, including
    nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
    size of 2 arrays is not the sum of the sizes computed from this function. In
    particular, [`StructArray`]'s size is an upper bound.

    When an array is sliced, its allocated size remains constant because the
    buffer is unchanged. However, this function will yield a smaller number. This
    is because this function returns the visible size of the buffer, not its
    total capacity.

    FFI buffers are included in this estimation.

    Parameters
    ----------
    unit : {'b', 'kb', 'mb', 'gb', 'tb'}
        Scale the returned size to the given unit.

    Examples
    --------
    >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32)
    >>> s.estimated_size()
    4000000
    >>> s.estimated_size("mb")
    3.814697265625

    """
    return scale_bytes(self._s.estimated_size(), unit)
def sqrt(self) -> Series:
    """
    Compute the square root of the elements.

    Syntactic sugar for

    >>> pl.Series([1, 2]) ** 0.5
    shape: (2,)
    Series: '' [f64]
    [
        1.0
        1.414214
    ]

    """
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def any(self) -> bool:
    """
    Check if any boolean value in the column is `True`.

    Returns
    -------
    Boolean literal

    """
    frame = self.to_frame()
    reduced = frame.select(F.col(self.name).any()).to_series()
    return reduced[0]
def all(self) -> bool:
    """
    Check if all boolean values in the column are `True`.

    Returns
    -------
    Boolean literal

    """
    frame = self.to_frame()
    reduced = frame.select(F.col(self.name).all()).to_series()
    return reduced[0]
def log(self, base: float = math.e) -> Series:
    """
    Compute the logarithm to a given base.

    Parameters
    ----------
    base
        Base of the logarithm; defaults to ``math.e``.

    """
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def log1p(self) -> Series:
    """Compute the natural logarithm of the input array plus one, element-wise."""
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def log10(self) -> Series:
    """Compute the base 10 logarithm of the input array, element-wise."""
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def exp(self) -> Series:
    """Compute the exponential, element-wise."""
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def drop_nulls(self) -> Series:
    """
    Drop all null values.

    Creates a new Series that copies data from this Series without null values.

    """
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def drop_nans(self) -> Series:
    """Drop NaN values."""
    # No body in this definition. NOTE(review): presumably the implementation
    # is supplied by the class-level dispatch decorator — confirm.
def to_frame(self, name: str | None = None) -> DataFrame:
    """
    Cast this Series to a DataFrame.

    Parameters
    ----------
    name
        optionally name/rename the Series column in the new DataFrame.

    Examples
    --------
    >>> s = pl.Series("a", [123, 456])
    >>> df = s.to_frame()
    >>> df
    shape: (2, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 123 │
    │ 456 │
    └─────┘

    >>> df = s.to_frame("xyz")
    >>> df
    shape: (2, 1)
    ┌─────┐
    │ xyz │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 123 │
    │ 456 │
    └─────┘

    """
    # Only a string `name` triggers a rename; anything else keeps the column as is.
    column = self.rename(name)._s if isinstance(name, str) else self._s
    return wrap_df(PyDataFrame([column]))
def describe(
    self, percentiles: Sequence[float] | float | None = (0.25, 0.75)
) -> DataFrame:
    """
    Quick summary statistics of a series.

    Series with mixed datatypes will return summary statistics for the datatype of
    the first value.

    Parameters
    ----------
    percentiles
        One or more percentiles to include in the summary statistics (if the
        series has a numeric dtype). All values must be in the range `[0, 1]`.
        A single number (int or float) is treated as a one-element sequence.

    Returns
    -------
    DataFrame
        Two columns, ``statistic`` and ``value``, with one row per statistic.

    Raises
    ------
    ValueError
        If a percentile lies outside `[0, 1]`, or if the Series is empty.
    TypeError
        If the dtype is not numeric, boolean, utf8 or temporal.

    Examples
    --------
    >>> series_num = pl.Series([1, 2, 3, 4, 5])
    >>> series_num.describe()
    shape: (9, 2)
    ┌────────────┬──────────┐
    │ statistic  ┆ value    │
    │ ---        ┆ ---      │
    │ str        ┆ f64      │
    ╞════════════╪══════════╡
    │ count      ┆ 5.0      │
    │ null_count ┆ 0.0      │
    │ mean       ┆ 3.0      │
    │ std        ┆ 1.581139 │
    │ min        ┆ 1.0      │
    │ max        ┆ 5.0      │
    │ median     ┆ 3.0      │
    │ 25%        ┆ 2.0      │
    │ 75%        ┆ 4.0      │
    └────────────┴──────────┘

    >>> series_str = pl.Series(["a", "a", None, "b", "c"])
    >>> series_str.describe()
    shape: (3, 2)
    ┌────────────┬───────┐
    │ statistic  ┆ value │
    │ ---        ┆ ---   │
    │ str        ┆ i64   │
    ╞════════════╪═══════╡
    │ count      ┆ 5     │
    │ null_count ┆ 1     │
    │ unique     ┆ 4     │
    └────────────┴───────┘

    """
    # A lone number is shorthand for a one-element sequence. Integers (0 or 1)
    # are valid percentiles too, so they must be wrapped just like floats;
    # otherwise the validation below would try to iterate over an int.
    if isinstance(percentiles, (int, float)):
        percentiles = [percentiles]
    if percentiles and not all((0 <= p <= 1) for p in percentiles):
        raise ValueError("Percentiles must all be in the range [0, 1].")

    stats: dict[str, PythonLiteral | None]

    if self.len() == 0:
        raise ValueError("Series must contain at least one value")

    elif self.is_numeric():
        # Work on a float copy so every statistic shares one value dtype.
        s = self.cast(Float64)
        stats = {
            "count": s.len(),
            "null_count": s.null_count(),
            "mean": s.mean(),
            "std": s.std(),
            "min": s.min(),
            "max": s.max(),
            "median": s.median(),
        }
        if percentiles:
            stats.update({f"{p:.0%}": s.quantile(p) for p in percentiles})

    elif self.is_boolean():
        stats = {
            "count": self.len(),
            "null_count": self.null_count(),
            "sum": self.sum(),
        }
    elif self.is_utf8():
        stats = {
            "count": self.len(),
            "null_count": self.null_count(),
            "unique": len(self.unique()),
        }
    elif self.is_temporal():
        # we coerce all to string, because a polars column
        # only has a single dtype and dates: datetime and count: int don't match
        stats = {
            "count": str(self.len()),
            "null_count": str(self.null_count()),
            "min": str(self.dt.min()),
            "max": str(self.dt.max()),
            "median": str(self.dt.median()),
        }
    else:
        raise TypeError("This type is not supported")

    return pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})
def sum(self) -> int | float:
    """
    Reduce this Series to the sum of its values.

    Notes
    -----
    Dtypes in {Int8, UInt8, Int16, UInt16} are cast to
    Int64 before summing to prevent overflow issues.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.sum()
    6

    """
    total = self._s.sum()
    return total
def mean(self) -> int | float | None:
    """
    Reduce this Series to the mean of its values.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.mean()
    2.0

    """
    average = self._s.mean()
    return average
def product(self) -> int | float:
    """Reduce this Series to the product of its values."""
    # Evaluate through the expression engine and unwrap the single result.
    reduced = self.to_frame().select(F.col(self.name).product())
    return reduced.to_series()[0]
def pow(self, exponent: int | float | Series) -> Series:
    """
    Raise each element to the power of the given exponent.

    Parameters
    ----------
    exponent
        The exponent. Accepts Series input.

    Raises
    ------
    ValueError
        If this Series has a temporal dtype.

    Examples
    --------
    >>> s = pl.Series("foo", [1, 2, 3, 4])
    >>> s.pow(3)
    shape: (4,)
    Series: 'foo' [f64]
    [
        1.0
        8.0
        27.0
        64.0
    ]

    """
    if self.is_temporal():
        raise ValueError(
            "first cast to integer before raising datelike dtypes to a power"
        )
    # numpy arrays are wrapped in a Series before being handed to the expression.
    is_ndarray = _check_for_numpy(exponent) and isinstance(exponent, np.ndarray)
    if is_ndarray:
        exponent = Series(exponent)
    frame = self.to_frame()
    return frame.select(F.col(self.name).pow(exponent)).to_series()
def min(self) -> PythonLiteral | None:
    """
    Get the smallest value in this Series.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.min()
    1

    """
    smallest = self._s.min()
    return smallest
def max(self) -> PythonLiteral | None:
    """
    Get the largest value in this Series.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.max()
    3

    """
    largest = self._s.max()
    return largest
def nan_max(self) -> int | float | date | datetime | timedelta | str:
    """
    Get maximum value, but propagate/poison encountered NaN values.

    Note that the naming is the reverse of numpy's: numpy's `nanmax` ignores
    NaN values, whereas this method propagates them (polars ignores NaN
    values by default).

    """
    nan_aware = F.col(self.name).nan_max()
    return self.to_frame().select(nan_aware).item()
def nan_min(self) -> int | float | date | datetime | timedelta | str:
    """
    Get minimum value, but propagate/poison encountered NaN values.

    Note that the naming is the reverse of numpy's: numpy's `nanmin` ignores
    NaN values, whereas this method propagates them (polars ignores NaN
    values by default).

    """
    nan_aware = F.col(self.name).nan_min()
    return self.to_frame().select(nan_aware).item()
def std(self, ddof: int = 1) -> float | None:
    """
    Get the standard deviation of this Series.

    Parameters
    ----------
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    Returns
    -------
    The standard deviation, or ``None`` when the Series is not numeric.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.std()
    1.0

    """
    if self.is_numeric():
        reduced = self.to_frame().select(F.col(self.name).std(ddof))
        return reduced.to_series()[0]
    # Non-numeric dtypes have no standard deviation.
    return None
def var(self, ddof: int = 1) -> float | None:
    """
    Get the variance of this Series.

    Parameters
    ----------
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    Returns
    -------
    The variance, or ``None`` when the Series is not numeric.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.var()
    1.0

    """
    if self.is_numeric():
        reduced = self.to_frame().select(F.col(self.name).var(ddof))
        return reduced.to_series()[0]
    # Non-numeric dtypes have no variance.
    return None
def median(self) -> float | None:
    """
    Get the median value of this Series.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.median()
    2.0

    """
    middle = self._s.median()
    return middle
def quantile(
    self, quantile: float, interpolation: RollingInterpolationMethod = "nearest"
) -> float | None:
    """
    Get the value at the given quantile of this Series.

    Parameters
    ----------
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
        Interpolation method.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.quantile(0.5)
    2.0

    """
    value = self._s.quantile(quantile, interpolation)
    return value
def to_dummies(self, separator: str = "_") -> DataFrame:
    """
    Get dummy/indicator variables for this Series.

    Parameters
    ----------
    separator
        Separator/delimiter used when generating column names.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.to_dummies()
    shape: (3, 3)
    ┌─────┬─────┬─────┐
    │ a_1 ┆ a_2 ┆ a_3 │
    │ --- ┆ --- ┆ --- │
    │ u8  ┆ u8  ┆ u8  │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 0   ┆ 0   │
    │ 0   ┆ 1   ┆ 0   │
    │ 0   ┆ 0   ┆ 1   │
    └─────┴─────┴─────┘

    """
    dummies = self._s.to_dummies(separator)
    return wrap_df(dummies)
def cut(
    self,
    bins: list[float],
    labels: list[str] | None = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
    *,
    maintain_order: bool = False,
) -> DataFrame:
    """
    Bin values into discrete categories.

    Parameters
    ----------
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be
        len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.
    maintain_order
        Keep the order of the original `Series`.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> a.cut([-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    │ …    ┆ …           ┆ …            │
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    # The bin edges travel as a Float64 Series named after the breakpoint column.
    bin_edges = Series(break_point_label, bins, dtype=Float64)
    binned = self._s.cut(
        bin_edges._s,
        labels,
        break_point_label,
        category_label,
        maintain_order,
    )
    return wrap_df(binned)
def qcut(
    self,
    quantiles: list[float],
    *,
    labels: list[str] | None = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
    maintain_order: bool = False,
) -> DataFrame:
    """
    Bin values into discrete categories based on their quantiles.

    Parameters
    ----------
    quantiles
        Quantiles to create.
        We expect quantiles ``0.0 <= quantile <= 1``
    labels
        Labels to assign to the quantiles. If given the length of labels must be
        len(quantiles) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.
    maintain_order
        Keep the order of the original `Series`.

    Returns
    -------
    DataFrame

    Warnings
    --------
    This functionality is experimental and may change without it being considered a
    breaking change.

    Examples
    --------
    >>> a = pl.Series("a", range(-5, 3))
    >>> a.qcut([0.0, 0.25, 0.75])
    shape: (8, 3)
    ┌──────┬─────────────┬───────────────┐
    │ a    ┆ break_point ┆ category      │
    │ ---  ┆ ---         ┆ ---           │
    │ f64  ┆ f64         ┆ cat           │
    ╞══════╪═════════════╪═══════════════╡
    │ -5.0 ┆ -5.0        ┆ (-inf, -5.0]  │
    │ -4.0 ┆ -3.25       ┆ (-5.0, -3.25] │
    │ -3.0 ┆ 0.25        ┆ (-3.25, 0.25] │
    │ -2.0 ┆ 0.25        ┆ (-3.25, 0.25] │
    │ -1.0 ┆ 0.25        ┆ (-3.25, 0.25] │
    │ 0.0  ┆ 0.25        ┆ (-3.25, 0.25] │
    │ 1.0  ┆ inf         ┆ (0.25, inf]   │
    │ 2.0  ┆ inf         ┆ (0.25, inf]   │
    └──────┴─────────────┴───────────────┘

    """
    # The quantiles travel as an (unnamed) Float64 Series.
    quantile_series = Series(quantiles, dtype=Float64)
    binned = self._s.qcut(
        quantile_series._s,
        labels,
        break_point_label,
        category_label,
        maintain_order,
    )
    return wrap_df(binned)
def hist(
    self,
    bins: list[float] | None = None,
    *,
    bin_count: int | None = None,
) -> DataFrame:
    """
    Bin values into buckets and count their occurrences.

    Parameters
    ----------
    bins
        Discretizations to make.
        If None given, we determine the boundaries based on the data.
    bin_count
        If no bins provided, this will be used to determine
        the distance of the bins

    Returns
    -------
    DataFrame

    Warnings
    --------
    This functionality is experimental and may change without it being considered a
    breaking change.

    Examples
    --------
    >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])
    >>> a.hist(bin_count=4)
    shape: (5, 3)
    ┌─────────────┬─────────────┬─────────┐
    │ break_point ┆ category    ┆ a_count │
    │ ---         ┆ ---         ┆ ---     │
    │ f64         ┆ cat         ┆ u32     │
    ╞═════════════╪═════════════╪═════════╡
    │ 0.0         ┆ (-inf, 0.0] ┆ 0       │
    │ 2.25        ┆ (0.0, 2.25] ┆ 3       │
    │ 4.5         ┆ (2.25, 4.5] ┆ 2       │
    │ 6.75        ┆ (4.5, 6.75] ┆ 0       │
    │ inf         ┆ (6.75, inf] ┆ 2       │
    └─────────────┴─────────────┴─────────┘

    """
    # Only a non-empty bins list is converted; a falsy value is passed through
    # unchanged.
    bin_arg = Series(bins, dtype=Float64)._s if bins else bins
    return wrap_df(self._s.hist(bin_arg, bin_count))
def value_counts(self, *, sort: bool = False) -> DataFrame:
    """
    Count the occurrences of each unique value in a Series.

    Parameters
    ----------
    sort
        Ensure the output is sorted from most values to least.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.value_counts().sort(by="a")
    shape: (3, 2)
    ┌─────┬────────┐
    │ a   ┆ counts │
    │ --- ┆ ---    │
    │ i64 ┆ u32    │
    ╞═════╪════════╡
    │ 1   ┆ 1      │
    │ 2   ┆ 2      │
    │ 3   ┆ 1      │
    └─────┴────────┘

    """
    counted = self._s.value_counts(sort)
    return wrap_df(counted)
def unique_counts(self) -> Series:
    """
    Return the count of each unique value, in order of first appearance.

    Examples
    --------
    >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"])
    >>> s.unique_counts()
    shape: (3,)
    Series: 'id' [u32]
    [
        1
        2
        3
    ]

    """
def entropy(self, base: float = math.e, *, normalize: bool = False) -> float | None:
    """
    Compute the entropy.

    Uses the formula ``-sum(pk * log(pk))`` where ``pk`` are discrete probabilities.

    Parameters
    ----------
    base
        Given base, defaults to `e`
    normalize
        Normalize pk if it doesn't sum to 1.

    Examples
    --------
    >>> a = pl.Series([0.99, 0.005, 0.005])
    >>> a.entropy(normalize=True)
    0.06293300616044681
    >>> b = pl.Series([0.65, 0.10, 0.25])
    >>> b.entropy(normalize=True)
    0.8568409950394724

    """
    # Evaluate through the expression engine and unwrap the single result.
    reduced = self.to_frame().select(
        F.col(self.name).entropy(base, normalize=normalize)
    )
    return reduced.to_series()[0]
def cumulative_eval(
    self, expr: Expr, min_periods: int = 1, *, parallel: bool = False
) -> Series:
    """
    Run an expression over a sliding window that increases `1` slot every iteration.

    Parameters
    ----------
    expr
        Expression to evaluate
    min_periods
        Number of valid values there should be in the window before the expression
        is evaluated. valid values = `length - null_count`
    parallel
        Run in parallel. Don't do this in a groupby or another operation that
        already has much parallelization.

    Warnings
    --------
    This functionality is experimental and may change without it being considered a
    breaking change.

    This can be really slow as it can have `O(n^2)` complexity. Don't use this
    for operations that visit all elements.

    Examples
    --------
    >>> s = pl.Series("values", [1, 2, 3, 4, 5])
    >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2)
    shape: (5,)
    Series: 'values' [f64]
    [
        0.0
        -3.0
        -8.0
        -15.0
        -24.0
    ]

    """
def alias(self, name: str) -> Series:
    """
    Return a renamed copy of the Series; the original keeps its name.

    Parameters
    ----------
    name
        New name.

    Examples
    --------
    >>> srs = pl.Series("x", [1, 2, 3])
    >>> new_aliased_srs = srs.alias("y")

    """
    # Rename the clone in place so that `self` is left untouched.
    renamed = self.clone()
    renamed._s.rename(name)
    return renamed
def rename(self, name: str, *, in_place: bool = False) -> Series:
    """
    Give this Series a new name.

    Parameters
    ----------
    name
        New name.
    in_place
        Modify the Series in-place instead of returning a renamed copy.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.rename("b")
    shape: (3,)
    Series: 'b' [i64]
    [
        1
        2
        3
    ]

    """
    if not in_place:
        # Default path: hand back a renamed copy.
        return self.alias(name)
    self._s.rename(name)
    return self
def chunk_lengths(self) -> list[int]:
    """
    Get the length of every individual chunk.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s2 = pl.Series("a", [4, 5, 6])

    Concatenate Series with rechunk = True

    >>> pl.concat([s, s2]).chunk_lengths()
    [6]

    Concatenate Series with rechunk = False

    >>> pl.concat([s, s2], rechunk=False).chunk_lengths()
    [3, 3]

    """
    lengths = self._s.chunk_lengths()
    return lengths
def n_chunks(self) -> int:
    """
    Get the number of chunks this Series consists of.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.n_chunks()
    1
    >>> s2 = pl.Series("a", [4, 5, 6])

    Concatenate Series with rechunk = True

    >>> pl.concat([s, s2]).n_chunks()
    1

    Concatenate Series with rechunk = False

    >>> pl.concat([s, s2], rechunk=False).n_chunks()
    2

    """
    count = self._s.n_chunks()
    return count
def cummax(self, *, reverse: bool = False) -> Series:
    """
    Get an array with the cumulative max computed at every element.

    Parameters
    ----------
    reverse
        Reverse the operation.

    Examples
    --------
    >>> s = pl.Series("s", [3, 5, 1])
    >>> s.cummax()
    shape: (3,)
    Series: 's' [i64]
    [
        3
        5
        5
    ]

    """
def cummin(self, *, reverse: bool = False) -> Series:
    """
    Get an array with the cumulative min computed at every element.

    Parameters
    ----------
    reverse
        Reverse the operation.

    Examples
    --------
    >>> s = pl.Series("s", [1, 2, 3])
    >>> s.cummin()
    shape: (3,)
    Series: 's' [i64]
    [
        1
        1
        1
    ]

    """
def cumprod(self, *, reverse: bool = False) -> Series:
    """
    Get an array with the cumulative product computed at every element.

    Parameters
    ----------
    reverse
        Reverse the operation.

    Notes
    -----
    Dtypes in {Int8, UInt8, Int16, UInt16} are cast to
    Int64 before the product is computed to prevent overflow issues.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.cumprod()
    shape: (3,)
    Series: 'a' [i64]
    [
        1
        2
        6
    ]

    """
def cumsum(self, *, reverse: bool = False) -> Series:
    """
    Get an array with the cumulative sum computed at every element.

    Parameters
    ----------
    reverse
        Reverse the operation.

    Notes
    -----
    Dtypes in {Int8, UInt8, Int16, UInt16} are cast to
    Int64 before summing to prevent overflow issues.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.cumsum()
    shape: (3,)
    Series: 'a' [i64]
    [
        1
        3
        6
    ]

    """
def slice(self, offset: int, length: int | None = None) -> Series:
    """
    Get a slice of this Series.

    Parameters
    ----------
    offset
        Start index. Negative indexing is supported.
    length
        Length of the slice. If set to ``None``, all rows starting at the offset
        will be selected.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4])
    >>> s.slice(1, 2)
    shape: (2,)
    Series: 'a' [i64]
    [
        2
        3
    ]

    """
def append(self, other: Series, *, append_chunks: bool = True) -> Series:
    """
    Append a Series to this one, modifying this Series and returning it.

    Parameters
    ----------
    other
        Series to append.
    append_chunks
        If set to `True` the append operation will add the chunks from `other` to
        self. This is super cheap.

        If set to `False` the append operation will do the same as
        `DataFrame.extend` which extends the memory backed by this `Series` with
        the values from `other`.

        Different from `append_chunks`, `extend` appends the data from `other` to
        the underlying memory locations and thus may cause a reallocation (which are
        expensive).

        If this does not cause a reallocation, the resulting data structure will not
        have any extra chunks and thus will yield faster queries.

        Prefer `extend` over `append_chunks` when you want to do a query after a
        single append. For instance during online operations where you add `n` rows
        and rerun a query.

        Prefer `append_chunks` over `extend` when you want to append many times
        before doing a query. For instance when you read in multiple files and when
        to store them in a single `Series`. In the latter case, finish the sequence
        of `append_chunks` operations with a `rechunk`.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s2 = pl.Series("b", [4, 5, 6])
    >>> s.append(s2)
    shape: (6,)
    Series: 'a' [i64]
    [
        1
        2
        3
        4
        5
        6
    ]

    """
    try:
        grow = self._s.append if append_chunks else self._s.extend
        grow(other._s)
    except RuntimeError as exc:
        if str(exc) != "Already mutably borrowed":
            raise
        # Retry with a clone of `other` when the borrow error is reported.
        self.append(other.clone(), append_chunks=append_chunks)
    return self
def filter(self, predicate: Series | list[bool]) -> Self:
    """
    Keep only the elements selected by a boolean mask.

    Parameters
    ----------
    predicate
        Boolean mask.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> mask = pl.Series("", [True, False, True])
    >>> s.filter(mask)
    shape: (2,)
    Series: 'a' [i64]
    [
        1
        3
    ]

    """
    # A plain list is wrapped in an unnamed Series first.
    mask = Series("", predicate) if isinstance(predicate, list) else predicate
    return self._from_pyseries(self._s.filter(mask._s))
def head(self, n: int = 10) -> Series:
    """
    Get the first `n` elements.

    Parameters
    ----------
    n
        Number of elements to return. If a negative value is passed, return all
        elements except the last ``abs(n)``.

    See Also
    --------
    tail, slice

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4, 5])
    >>> s.head(3)
    shape: (3,)
    Series: 'a' [i64]
    [
        1
        2
        3
    ]

    Pass a negative value to get all rows `except` the last ``abs(n)``.

    >>> s.head(-3)
    shape: (2,)
    Series: 'a' [i64]
    [
        1
        2
    ]

    """
    # A negative n counts back from the end, clamped at zero elements.
    count = n if n >= 0 else max(0, self.len() + n)
    return self.to_frame().select(F.col(self.name).head(count)).to_series()
def tail(self, n: int = 10) -> Series:
    """
    Get the last `n` elements.

    Parameters
    ----------
    n
        Number of elements to return. If a negative value is passed, return all
        elements except the first ``abs(n)``.

    See Also
    --------
    head, slice

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4, 5])
    >>> s.tail(3)
    shape: (3,)
    Series: 'a' [i64]
    [
        3
        4
        5
    ]

    Pass a negative value to get all rows `except` the first ``abs(n)``.

    >>> s.tail(-3)
    shape: (2,)
    Series: 'a' [i64]
    [
        4
        5
    ]

    """
    # A negative n counts back from the end, clamped at zero elements.
    count = n if n >= 0 else max(0, self.len() + n)
    return self.to_frame().select(F.col(self.name).tail(count)).to_series()
def limit(self, n: int = 10) -> Series:
    """
    Get the first `n` elements.

    Alias for :func:`Series.head`.

    Parameters
    ----------
    n
        Number of elements to return. If a negative value is passed, return all
        elements except the last ``abs(n)``.

    See Also
    --------
    head

    """
    first_n = self.head(n)
    return first_n
def take_every(self, n: int) -> Series:
    """
    Take every nth value in the Series and return as new Series.

    Parameters
    ----------
    n
        Take every `n`-th value.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4])
    >>> s.take_every(2)
    shape: (2,)
    Series: 'a' [i64]
    [
        1
        3
    ]

    """
def sort(self, *, descending: bool = False, in_place: bool = False) -> Self:
    """
    Sort the values of this Series.

    Parameters
    ----------
    descending
        Sort in descending order.
    in_place
        Sort in-place: replace the data of this Series and return it, rather
        than returning a new Series.

    Examples
    --------
    >>> s = pl.Series("a", [1, 3, 4, 2])
    >>> s.sort()
    shape: (4,)
    Series: 'a' [i64]
    [
        1
        2
        3
        4
    ]
    >>> s.sort(descending=True)
    shape: (4,)
    Series: 'a' [i64]
    [
        4
        3
        2
        1
    ]

    """
    ordered = self._s.sort(descending)
    if not in_place:
        return self._from_pyseries(ordered)
    self._s = ordered
    return self
def top_k(self, k: int = 5) -> Series:
    r"""
    Return the `k` largest elements.

    This has time complexity:

    .. math:: O(n + k \log{}n - \frac{k}{2})

    Parameters
    ----------
    k
        Number of elements to return.

    See Also
    --------
    bottom_k

    Examples
    --------
    >>> s = pl.Series("a", [2, 5, 1, 4, 3])
    >>> s.top_k(3)
    shape: (3,)
    Series: 'a' [i64]
    [
        5
        4
        3
    ]

    """
def bottom_k(self, k: int = 5) -> Series:
    r"""
    Return the `k` smallest elements.

    This has time complexity:

    .. math:: O(n + k \log{}n - \frac{k}{2})

    Parameters
    ----------
    k
        Number of elements to return.

    See Also
    --------
    top_k

    Examples
    --------
    >>> s = pl.Series("a", [2, 5, 1, 4, 3])
    >>> s.bottom_k(3)
    shape: (3,)
    Series: 'a' [i64]
    [
        1
        2
        3
    ]

    """
def arg_sort(self, *, descending: bool = False, nulls_last: bool = False) -> Series:
    """
    Get the index values that would sort this Series.

    Parameters
    ----------
    descending
        Sort in descending order.
    nulls_last
        Place null values last instead of first.

    Examples
    --------
    >>> s = pl.Series("a", [5, 3, 4, 1, 2])
    >>> s.arg_sort()
    shape: (5,)
    Series: 'a' [u32]
    [
        3
        4
        1
        2
        0
    ]

    """
def arg_unique(self) -> Series:
    """
    Get the index of the first occurrence of each unique value.

    Returns
    -------
    Series

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.arg_unique()
    shape: (3,)
    Series: 'a' [u32]
    [
        0
        1
        3
    ]

    """
def arg_min(self) -> int | None:
    """
    Get the index of the smallest value.

    Returns
    -------
    Integer

    Examples
    --------
    >>> s = pl.Series("a", [3, 2, 1])
    >>> s.arg_min()
    2

    """
    position = self._s.arg_min()
    return position
def arg_max(self) -> int | None:
    """
    Get the index of the largest value.

    Returns
    -------
    Integer

    Examples
    --------
    >>> s = pl.Series("a", [3, 2, 1])
    >>> s.arg_max()
    0

    """
    position = self._s.arg_max()
    return position
@overload
def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int:
    ...

@overload
def search_sorted(
    self,
    element: Series | np.ndarray[Any, Any] | list[int] | list[float],
    side: SearchSortedSide = ...,
) -> Series:
    ...

def search_sorted(
    self,
    element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float],
    side: SearchSortedSide = "any",
) -> int | Series:
    """
    Find indices where elements should be inserted to maintain order.

    .. math:: a[i-1] < v <= a[i]

    Parameters
    ----------
    element
        Scalar value, or a Series / numpy array / list of values, to locate.
    side : {'any', 'left', 'right'}
        If 'any', the index of the first suitable location found is given.
        If 'left', the index of the leftmost suitable location found is given.
        If 'right', the index of the rightmost suitable location found is given.

    Returns
    -------
    A single integer for scalar input, otherwise a Series of indices.

    """
    is_scalar = isinstance(element, (int, float))
    if not is_scalar:
        # Non-scalar input is wrapped in a Series before the lookup.
        element = Series(element)
    found = F.select(F.lit(self).search_sorted(element, side))
    return found.item() if is_scalar else found.to_series()
def unique(self, *, maintain_order: bool = False) -> Series:
    """
    Get the unique elements of the Series.

    Parameters
    ----------
    maintain_order
        Maintain order of data. This requires more work.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.unique().sort()
    shape: (3,)
    Series: 'a' [i64]
    [
        1
        2
        3
    ]

    """
def take(
    self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]
) -> Series:
    """
    Select values by their index.

    Parameters
    ----------
    indices
        Index location used for selection.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4])
    >>> s.take([1, 3])
    shape: (2,)
    Series: 'a' [i64]
    [
        2
        4
    ]

    """
    picked = self.to_frame().select(F.col(self.name).take(indices))
    return picked.to_series()
def null_count(self) -> int:
    """Count the number of null values in this Series."""
    nulls = self._s.null_count()
    return nulls
def has_validity(self) -> bool:
    """
    Return True if the Series has a validity bitmask.

    If there is none, it means that there are no null values.
    Use this to swiftly assert a Series does not have null values.

    """
    has_mask = self._s.has_validity()
    return has_mask
def is_empty(self) -> bool:
    """
    Check whether the Series contains no elements.

    Examples
    --------
    >>> s = pl.Series("a", [], dtype=pl.Float32)
    >>> s.is_empty()
    True

    """
    n_elements = self.len()
    return n_elements == 0
def is_sorted(self, *, descending: bool = False) -> bool:
    """
    Check whether the Series is sorted.

    Parameters
    ----------
    descending
        Check if the Series is sorted in descending order

    """
    in_order = self._s.is_sorted(descending)
    return in_order
def is_null(self) -> Series:
    """
    Return a boolean Series indicating which values are null.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
    >>> s.is_null()
    shape: (4,)
    Series: 'a' [bool]
    [
        false
        false
        false
        true
    ]

    """
def is_not_null(self) -> Series:
    """
    Return a boolean Series indicating which values are not null.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
    >>> s.is_not_null()
    shape: (4,)
    Series: 'a' [bool]
    [
        true
        true
        true
        false
    ]

    """
def is_finite(self) -> Series:
    """
    Return a boolean Series indicating which values are finite.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> import numpy as np
    >>> s = pl.Series("a", [1.0, 2.0, np.inf])
    >>> s.is_finite()
    shape: (3,)
    Series: 'a' [bool]
    [
        true
        true
        false
    ]

    """
def is_infinite(self) -> Series:
    """
    Return a boolean Series indicating which values are infinite.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> import numpy as np
    >>> s = pl.Series("a", [1.0, 2.0, np.inf])
    >>> s.is_infinite()
    shape: (3,)
    Series: 'a' [bool]
    [
        false
        false
        true
    ]

    """
def is_nan(self) -> Series:
    """
    Return a boolean Series indicating which values are NaN.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> import numpy as np
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN])
    >>> s.is_nan()
    shape: (4,)
    Series: 'a' [bool]
    [
        false
        false
        false
        true
    ]

    """
def is_not_nan(self) -> Series:
    """
    Return a boolean Series indicating which values are not NaN.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> import numpy as np
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN])
    >>> s.is_not_nan()
    shape: (4,)
    Series: 'a' [bool]
    [
        true
        true
        true
        false
    ]

    """
def is_in(self, other: Series | Collection[Any]) -> Series:
    """
    Check if elements of this Series are in the other Series.

    Parameters
    ----------
    other
        Series or collection of values to test membership against.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s2 = pl.Series("b", [2, 4])
    >>> s2.is_in(s)
    shape: (2,)
    Series: 'b' [bool]
    [
        true
        false
    ]

    >>> # check if some values are a member of sublists
    >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]])
    >>> optional_members = pl.Series("optional_members", [1, 2, 3])
    >>> print(sets)
    shape: (3,)
    Series: 'sets' [list[i64]]
    [
        [1, 2, 3]
        [1, 2]
        [9, 10]
    ]
    >>> print(optional_members)
    shape: (3,)
    Series: 'optional_members' [i64]
    [
        1
        2
        3
    ]
    >>> optional_members.is_in(sets)
    shape: (3,)
    Series: 'optional_members' [bool]
    [
        true
        true
        false
    ]

    """
def arg_true(self) -> Series:
    """
    Get the index values at which a Boolean Series evaluates to True.

    Returns
    -------
    UInt32 Series

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> (s == 2).arg_true()
    shape: (1,)
    Series: 'a' [u32]
    [
        1
    ]

    """
    true_positions = F.arg_where(self, eager=True)
    return true_positions
def is_unique(self) -> Series:
    """
    Get a mask of all values that occur exactly once.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.is_unique()
    shape: (4,)
    Series: 'a' [bool]
    [
        true
        false
        false
        true
    ]

    """
def is_first(self) -> Series:
    """
    Get a mask of the first unique value.

    Returns
    -------
    Boolean Series

    """
def is_duplicated(self) -> Series:
    """
    Get a mask of all values that occur more than once.

    Returns
    -------
    Boolean Series

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.is_duplicated()
    shape: (4,)
    Series: 'a' [bool]
    [
        false
        true
        true
        false
    ]

    """
def explode(self) -> Series:
    """
    Explode a list or utf8 Series.

    This means that every item is expanded to a new row.

    Returns
    -------
    Exploded Series of same dtype

    See Also
    --------
    ListNameSpace.explode : Explode a list column.
    StringNameSpace.explode : Explode a string column.

    """
def series_equal(
    self, other: Series, *, null_equal: bool = True, strict: bool = False
) -> bool:
    """
    Check whether this Series is equal to another Series.

    Parameters
    ----------
    other
        Series to compare with.
    null_equal
        Consider null values as equal.
    strict
        Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a
        `pl.Int64` will return `False`.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s2 = pl.Series("b", [4, 5, 6])
    >>> s.series_equal(s)
    True
    >>> s.series_equal(s2)
    False

    """
    same = self._s.series_equal(other._s, null_equal, strict)
    return same
def len(self) -> int:
    """
    Get the number of elements in this Series.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.len()
    3

    """
    n_elements = self._s.len()
    return n_elements
def cast(
    self,
    dtype: (PolarsDataType | type[int] | type[float] | type[str] | type[bool]),
    *,
    strict: bool = True,
) -> Self:
    """
    Cast this Series to another data type.

    Parameters
    ----------
    dtype
        DataType to cast to.
    strict
        Throw an error if a cast could not be done for instance due to an overflow.

    Examples
    --------
    >>> s = pl.Series("a", [True, False, True])
    >>> s
    shape: (3,)
    Series: 'a' [bool]
    [
        true
        false
        true
    ]

    >>> s.cast(pl.UInt32)
    shape: (3,)
    Series: 'a' [u32]
    [
        1
        0
        1
    ]

    """
    # Do not dispatch cast as it is expensive and used in other functions.
    target_dtype = py_type_to_dtype(dtype)
    converted = self._s.cast(target_dtype, strict)
    return self._from_pyseries(converted)
def to_physical(self) -> Series:
    """
    Cast to physical representation of the logical dtype.

    - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32`
    - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64`
    - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64`
    - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64`
    - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32`
    - Other data types will be left unchanged.

    Examples
    --------
    Replicating the pandas
    `pd.Series.factorize
    <https://pandas.pydata.org/docs/reference/api/pandas.Series.factorize.html>`_
    method.

    >>> s = pl.Series("values", ["a", None, "x", "a"])
    >>> s.cast(pl.Categorical).to_physical()
    shape: (4,)
    Series: 'values' [u32]
    [
        0
        null
        1
        0
    ]

    """
def to_list(self, *, use_pyarrow: bool = False) -> list[Any]:
    """
    Convert this Series to a Python List. This operation clones data.

    Parameters
    ----------
    use_pyarrow
        Use pyarrow for the conversion.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.to_list()
    [1, 2, 3]
    >>> type(s.to_list())
    <class 'list'>

    """
    if not use_pyarrow:
        return self._s.to_list()
    # Go through Arrow and let pyarrow build the Python list.
    return self.to_arrow().to_pylist()
def rechunk(self, *, in_place: bool = False) -> Self:
    """
    Create a single chunk of memory for this Series.

    Parameters
    ----------
    in_place
        In place or not. When True this Series itself is returned, otherwise
        a new Series wrapping the rechunked data.

    """
    rechunked = self._s.rechunk(in_place)
    if in_place:
        return self
    return self._from_pyseries(rechunked)
def reverse(self) -> Series:
    """
    Return the Series in reverse order.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8)
    >>> s.reverse()
    shape: (3,)
    Series: 'a' [i8]
    [
        3
        2
        1
    ]

    """
def is_between(
    self,
    lower_bound: IntoExpr,
    upper_bound: IntoExpr,
    closed: ClosedInterval = "both",
) -> Series:
    """
    Get a boolean mask of the values that fall between the given start/end values.

    Parameters
    ----------
    lower_bound
        Lower bound value. Accepts expression input. Non-expression inputs
        (including strings) are parsed as literals.
    upper_bound
        Upper bound value. Accepts expression input. Non-expression inputs
        (including strings) are parsed as literals.
    closed : {'both', 'left', 'right', 'none'}
        Define which sides of the interval are closed (inclusive).

    Examples
    --------
    >>> s = pl.Series("num", [1, 2, 3, 4, 5])
    >>> s.is_between(2, 4)
    shape: (5,)
    Series: 'num' [bool]
    [
        false
        true
        true
        true
        false
    ]

    Use the ``closed`` argument to include or exclude the values at the bounds:

    >>> s.is_between(2, 4, closed="left")
    shape: (5,)
    Series: 'num' [bool]
    [
        false
        true
        true
        false
        false
    ]

    You can also use strings as well as numeric/temporal values:

    >>> s = pl.Series("s", ["a", "b", "c", "d", "e"])
    >>> s.is_between("b", "d", closed="both")
    shape: (5,)
    Series: 's' [bool]
    [
        false
        true
        true
        true
        false
    ]

    """
    # String bounds are wrapped as literals before being passed to the expression.
    lower = F.lit(lower_bound) if isinstance(lower_bound, str) else lower_bound
    upper = F.lit(upper_bound) if isinstance(upper_bound, str) else upper_bound
    frame = self.to_frame()
    in_range = F.col(self.name).is_between(lower, upper, closed)
    return frame.select(in_range).to_series()
def is_numeric(self) -> bool:
    """
    Check whether the datatype of this Series is numeric.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.is_numeric()
    True

    """
    own_dtype = self.dtype
    return own_dtype in NUMERIC_DTYPES
def is_integer(self, signed: bool | None = None) -> bool:
    """
    Report whether the dtype of this Series is an integer (signed or unsigned).

    Parameters
    ----------
    signed
        * if `None`, both signed and unsigned integer dtypes will match.
        * if `True`, only signed integer dtypes will be considered a match.
        * if `False`, only unsigned integer dtypes will be considered a match.

    Raises
    ------
    ValueError
        If `signed` is not one of `None`, `True` or `False`.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32)
    >>> s.is_integer()
    True
    >>> s.is_integer(signed=False)
    True
    >>> s.is_integer(signed=True)
    False

    """
    # Pick the set of dtypes to test against; identity checks are deliberate so
    # that truthy/falsy non-bool values (e.g. 0 or 1) are rejected.
    if signed is None:
        candidates = INTEGER_DTYPES
    elif signed is True:
        candidates = SIGNED_INTEGER_DTYPES
    elif signed is False:
        candidates = UNSIGNED_INTEGER_DTYPES
    else:
        raise ValueError(f"'signed' must be None, True or False; given {signed!r}")
    return self.dtype in candidates
def is_temporal(self, excluding: OneOrMoreDataTypes | None = None) -> bool:
    """
    Report whether the dtype of this Series is temporal.

    Parameters
    ----------
    excluding
        Optionally exclude one or more temporal dtypes from matching.

    Examples
    --------
    >>> from datetime import date
    >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)])
    >>> s.is_temporal()
    True
    >>> s.is_temporal(excluding=[pl.Date])
    False

    """
    if excluding is not None:
        # A single (non-iterable) dtype is treated as a one-element collection.
        excluded = excluding if isinstance(excluding, Iterable) else [excluding]
        if self.dtype in excluded:
            return False
    return self.dtype in TEMPORAL_DTYPES
def is_float(self) -> bool:
    """
    Report whether the dtype of this Series is one of ``FLOAT_DTYPES``.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0])
    >>> s.is_float()
    True

    """
    dtype = self.dtype
    return dtype in FLOAT_DTYPES
def is_boolean(self) -> bool:
    """
    Report whether the dtype of this Series is ``Boolean``.

    Examples
    --------
    >>> s = pl.Series("a", [True, False, True])
    >>> s.is_boolean()
    True

    """
    dtype = self.dtype
    # Identity (not equality) comparison against the Boolean dtype object.
    return dtype is Boolean
def is_utf8(self) -> bool:
    """
    Report whether the dtype of this Series is ``Utf8``.

    Examples
    --------
    >>> s = pl.Series("x", ["a", "b", "c"])
    >>> s.is_utf8()
    True

    """
    dtype = self.dtype
    # Identity (not equality) comparison against the Utf8 dtype object.
    return dtype is Utf8
def view(self, *, ignore_nulls: bool = False) -> SeriesView:
    """
    Get a view into this Series data with a numpy array.

    This operation doesn't clone data, but does not include missing values.
    Don't use this unless you know what you are doing.

    Parameters
    ----------
    ignore_nulls
        If True then nulls are converted to 0.
        If False then an Exception is raised if nulls are present.

    Raises
    ------
    AssertionError
        If `ignore_nulls` is False and ``has_validity()`` reports True.

    Examples
    --------
    >>> s = pl.Series("a", [1, None])
    >>> s.view(ignore_nulls=True)
    SeriesView([1, 0])

    """
    # Raise explicitly rather than via `assert`: assert statements are removed
    # when Python runs with -O, which would silently skip this safety check.
    # AssertionError is kept so existing callers see the same exception type.
    if not ignore_nulls and self.has_validity():
        raise AssertionError(
            "cannot create a view of a Series with a validity mask;"
            " pass `ignore_nulls=True` to allow it"
        )

    from polars.series._numpy import SeriesView, _ptr_to_numpy

    ptr_type = dtype_to_ctype(self.dtype)
    ptr = self._s.as_single_ptr()
    array = _ptr_to_numpy(ptr, self.len(), ptr_type)
    # The array aliases the Series buffer, so it is exposed read-only.
    array.setflags(write=False)
    return SeriesView(array, self)
def to_numpy(
    self,
    *args: Any,
    zero_copy_only: bool = False,
    writable: bool = False,
    use_pyarrow: bool = True,
) -> np.ndarray[Any, Any]:
    """
    Convert this Series to numpy. This operation clones data but is completely safe.

    If you want a zero-copy view and know what you are doing, use `.view()`.

    Parameters
    ----------
    *args
        args will be sent to pyarrow.Array.to_numpy.
    zero_copy_only
        If True, an exception will be raised if the conversion to a numpy
        array would require copying the underlying data (e.g. in presence
        of nulls, or for non-primitive types).
    writable
        For numpy arrays created with zero copy (view on the Arrow data),
        the resulting array is not writable (Arrow data is immutable).
        By setting this to True, a copy of the array is made to ensure
        it is writable.
    use_pyarrow
        Use pyarrow for the conversion to numpy.

    Raises
    ------
    ValueError
        If `zero_copy_only` is True and the chosen conversion path cannot
        return a zero-copy array.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> arr = s.to_numpy()
    >>> arr  # doctest: +IGNORE_RESULT
    array([1, 2, 3], dtype=int64)
    >>> type(arr)
    <class 'numpy.ndarray'>

    """

    def convert_to_date(arr: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
        # Reinterpret the array as the numpy temporal dtype matching this
        # Series' dtype (and time unit, where one applies).
        if self.dtype == Date:
            tp = "datetime64[D]"
        elif self.dtype == Duration:
            tp = f"timedelta64[{self.time_unit}]"
        else:
            tp = f"datetime64[{self.time_unit}]"
        return arr.astype(tp)

    def raise_no_zero_copy() -> None:
        # Called on every path that has to copy; only raises when the caller
        # insisted on zero-copy.
        if zero_copy_only:
            raise ValueError("Cannot return a zero-copy array")

    # pyarrow path: taken for non-Object dtypes that are either non-temporal
    # or Time (Time is excluded from the temporal check below).
    if (
        use_pyarrow
        and _PYARROW_AVAILABLE
        and self.dtype != Object
        and not self.is_temporal(excluding=Time)
    ):
        return self.to_arrow().to_numpy(
            *args, zero_copy_only=zero_copy_only, writable=writable
        )

    elif self.dtype == Time:
        raise_no_zero_copy()
        # note: there is no native numpy "time" dtype
        return np.array(self.to_list(), dtype="object")
    else:
        if not self.has_validity():
            # NOTE(review): the temporal/numeric branches go through
            # `self.view(...)`, which is documented as not cloning data; that
            # seems at odds with the "clones data" summary above — confirm.
            if self.is_temporal():
                np_array = convert_to_date(self.view(ignore_nulls=True))
            elif self.is_numeric():
                np_array = self.view(ignore_nulls=True)
            else:
                raise_no_zero_copy()
                np_array = self._s.to_numpy()

        elif self.is_temporal():
            # Validity mask present: convert the physical representation
            # before casting to the numpy temporal dtype.
            np_array = convert_to_date(self.to_physical()._s.to_numpy())
        else:
            raise_no_zero_copy()
            np_array = self._s.to_numpy()

        # A writable result was requested but the array is read-only: hand
        # back a copy (which is not zero-copy, hence the check first).
        if writable and not np_array.flags.writeable:
            raise_no_zero_copy()
            return np_array.copy()
        else:
            return np_array
def to_arrow(self) -> pa.Array:
    """
    Return the Arrow Array that backs this Series.

    If the Series contains only a single chunk this operation is zero copy.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s = s.to_arrow()
    >>> s  # doctest: +ELLIPSIS
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      2,
      3
    ]

    """
    arrow_array = self._s.to_arrow()
    return arrow_array
def to_pandas(  # noqa: D417
    self, *args: Any, use_pyarrow_extension_array: bool = False, **kwargs: Any
) -> pd.Series:
    """
    Convert this Series to a pandas Series.

    This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
    This operation clones data, unless `use_pyarrow_extension_array=True`.

    Positional arguments are accepted but not used by this implementation.

    Parameters
    ----------
    use_pyarrow_extension_array
        Use PyArrow backed-extension array instead of numpy array for pandas
        Series. This allows zero copy operations and preservation of nulls
        values.
        Further operations on this pandas Series, might trigger conversion
        to NumPy arrays if that operation is not supported by pyarrow compute
        functions.
    kwargs
        Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.

    Raises
    ------
    ModuleNotFoundError
        If `use_pyarrow_extension_array=True` and pandas is older than 1.5,
        or pyarrow is unavailable or older than 8.

    Examples
    --------
    >>> s1 = pl.Series("a", [1, 2, 3])
    >>> s1.to_pandas()
    0    1
    1    2
    2    3
    Name: a, dtype: int64
    >>> s1.to_pandas(use_pyarrow_extension_array=True)  # doctest: +SKIP
    0    1
    1    2
    2    3
    Name: a, dtype: int64[pyarrow]
    >>> s2 = pl.Series("b", [1, 2, None, 4])
    >>> s2.to_pandas()
    0    1.0
    1    2.0
    2    NaN
    3    4.0
    Name: b, dtype: float64
    >>> s2.to_pandas(use_pyarrow_extension_array=True)  # doctest: +SKIP
    0       1
    1       2
    2    <NA>
    3       4
    Name: b, dtype: int64[pyarrow]

    """
    if use_pyarrow_extension_array:
        if parse_version(pd.__version__) < parse_version("1.5"):
            raise ModuleNotFoundError(
                f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__}.'
            )
        if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version(
            "8"
        ):
            # Build the message in two steps. Previously the conditional
            # expression applied to the *concatenated* literal, so a missing
            # pyarrow produced the message "." with no explanation at all.
            msg = 'pyarrow>=8.0.0 is required for `to_pandas("use_pyarrow_extension_array=True")`'
            if _PYARROW_AVAILABLE:
                msg += f", found pyarrow {pa.__version__}."
            else:
                msg += "."
            raise ModuleNotFoundError(msg)

    pd_series = (
        self.to_arrow().to_pandas(
            self_destruct=True,
            split_blocks=True,
            types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
            **kwargs,
        )
        if use_pyarrow_extension_array
        else self.to_arrow().to_pandas(**kwargs)
    )
    # The name is set explicitly on the pandas result.
    pd_series.name = self.name
    return pd_series
def to_init_repr(self, n: int = 1000) -> str:
    """
    Render the Series as a string of code that can re-create it.

    Parameters
    ----------
    n
        Only use first n elements.

    See Also
    --------
    polars.Series.to_init_repr
    polars.from_repr

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16)
    >>> print(s.to_init_repr())
    pl.Series("a", [1, 2, None, 4], dtype=pl.Int16)
    >>> s_from_str_repr = eval(s.to_init_repr())
    >>> s_from_str_repr
    shape: (4,)
    Series: 'a' [i16]
    [
            1
            2
            null
            4
    ]

    """
    values = self.head(n).to_list()
    return f'pl.Series("{self.name}", {values}, dtype=pl.{self.dtype})'
def set(self, filter: Series, value: int | float | str) -> Series:
    """
    Replace the values selected by a boolean mask.

    Parameters
    ----------
    filter
        Boolean mask.
    value
        Value with which to replace the masked values.

    Returns
    -------
    A new Series, or ``NotImplemented`` when no setter function is found for
    the dtype of this Series.

    Notes
    -----
    Use of this function is frequently an anti-pattern, as it can
    block optimisation (predicate pushdown, etc). Consider using
    `pl.when(predicate).then(value).otherwise(self)` instead.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.set(s == 2, 10)
    shape: (3,)
    Series: 'a' [i64]
    [
            1
            10
            3
    ]

    It is better to implement this as follows:

    >>> s.to_frame().select(
    ...     pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a"))
    ... )
    shape: (3, 1)
    ┌─────────┐
    │ literal │
    │ ---     │
    │ i64     │
    ╞═════════╡
    │ 1       │
    │ 10      │
    │ 3       │
    └─────────┘

    """
    # Look up the setter matching this dtype.
    setter = get_ffi_func("set_with_mask_<>", self.dtype, self._s)
    if setter is not None:
        return self._from_pyseries(setter(filter._s, value))
    return NotImplemented
def set_at_idx(
    self,
    idx: Series | np.ndarray[Any, Any] | Sequence[int] | int,
    value: (
        int
        | float
        | str
        | bool
        | Sequence[int]
        | Sequence[float]
        | Sequence[bool]
        | Sequence[str]
        | Sequence[date]
        | Sequence[datetime]
        | date
        | datetime
        | Series
        | None
    ),
) -> Series:
    """
    Set values at the index locations.

    Parameters
    ----------
    idx
        Integers representing the index locations.
    value
        Replacement values. A scalar (including a single ``date`` or
        ``datetime``) is repeated for every index location.

    Returns
    -------
    the series mutated

    Notes
    -----
    Use of this function is frequently an anti-pattern, as it can
    block optimisation (predicate pushdown, etc). Consider using
    `pl.when(predicate).then(value).otherwise(self)` instead.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.set_at_idx(1, 10)
    shape: (3,)
    Series: 'a' [i64]
    [
            1
            10
            3
    ]

    It is better to implement this as follows:

    >>> s.to_frame().with_row_count("row_nr").select(
    ...     pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a"))
    ... )
    shape: (3, 1)
    ┌─────────┐
    │ literal │
    │ ---     │
    │ i64     │
    ╞═════════╡
    │ 1       │
    │ 10      │
    │ 3       │
    └─────────┘

    """
    if isinstance(idx, int):
        idx = [idx]
    if len(idx) == 0:
        # Nothing to set.
        return self

    idx = Series("", idx)
    if isinstance(value, (int, float, bool, str)) or (value is None):
        value = Series("", [value])
        # `idx` is known to be non-empty here, so extend the single value to
        # one entry per index location.
        value = value.extend_constant(value[0], len(idx) - 1)
    elif isinstance(value, date):
        # Scalar date/datetime (datetime is a subclass of date). These are
        # allowed by the signature but are not sequences, so they must not be
        # passed to the Series constructor as-is; repeat them per index.
        value = Series("", [value] * len(idx))
    elif not isinstance(value, Series):
        value = Series("", value)
    self._s.set_at_idx(idx._s, value._s)
    return self
def clear(self, n: int = 0) -> Series:
    """
    Create an empty copy of the current Series, with zero to 'n' elements.

    The copy has an identical name/dtype, but no data.

    Parameters
    ----------
    n
        Number of (empty) elements to return in the cleared Series.

    See Also
    --------
    clone : Cheap deepcopy/clone.

    Examples
    --------
    >>> s = pl.Series("a", [None, True, False])
    >>> s.clear()
    shape: (0,)
    Series: 'a' [bool]
    [
    ]

    >>> s.clear(n=2)
    shape: (2,)
    Series: 'a' [bool]
    [
            null
            null
    ]

    """
    if n == 0:
        return self._from_pyseries(self._s.clear())

    # Start from an empty Series with the same name/dtype; an already-empty
    # Series is simply cloned.
    if len(self) > 0:
        empty = self.__class__(name=self.name, values=[], dtype=self.dtype)
    else:
        empty = self.clone()

    if n > 0:
        return empty.extend_constant(None, n=n)
    return empty
def clone(self) -> Self:
    """
    Very cheap deepcopy/clone.

    See Also
    --------
    clear : Create an empty copy of the current Series, with identical
        schema but no data.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.clone()
    shape: (3,)
    Series: 'a' [i64]
    [
            1
            2
            3
    ]

    """
    cloned = self._s.clone()
    return self._from_pyseries(cloned)
def fill_nan(self, value: int | float | Expr | None) -> Series:
    """
    Fill floating point NaN values with a fill value.

    Parameters
    ----------
    value
        Value used to fill NaN values.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, float("nan")])
    >>> s.fill_nan(0)
    shape: (4,)
    Series: 'a' [f64]
    [
            1.0
            2.0
            3.0
            0.0
    ]

    """
def fill_null(
    self,
    value: Any | None = None,
    strategy: FillNullStrategy | None = None,
    limit: int | None = None,
) -> Series:
    """
    Fill null values using the specified value or strategy.

    Parameters
    ----------
    value
        Value used to fill null values.
    strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
        Strategy used to fill null values.
    limit
        Number of consecutive null values to fill when using the 'forward' or
        'backward' strategy.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, None])
    >>> s.fill_null(strategy="forward")
    shape: (4,)
    Series: 'a' [i64]
    [
            1
            2
            3
            3
    ]
    >>> s.fill_null(strategy="min")
    shape: (4,)
    Series: 'a' [i64]
    [
            1
            2
            3
            1
    ]
    >>> s = pl.Series("b", ["x", None, "z"])
    >>> s.fill_null(pl.lit(""))
    shape: (3,)
    Series: 'b' [str]
    [
            "x"
            ""
            "z"
    ]

    """
def floor(self) -> Series:
    """
    Round down to the nearest integer value.

    Only works on floating point Series.

    Examples
    --------
    >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234])
    >>> s.floor()
    shape: (3,)
    Series: 'a' [f64]
    [
            1.0
            2.0
            3.0
    ]

    """
def ceil(self) -> Series:
    """
    Round up to the nearest integer value.

    Only works on floating point Series.

    Examples
    --------
    >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234])
    >>> s.ceil()
    shape: (3,)
    Series: 'a' [f64]
    [
            2.0
            3.0
            4.0
    ]

    """
def round(self, decimals: int = 0) -> Series:
    """
    Round underlying floating point data by `decimals` digits.

    Parameters
    ----------
    decimals
        Number of decimals to round by.

    Examples
    --------
    >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234])
    >>> s.round(2)
    shape: (3,)
    Series: 'a' [f64]
    [
            1.12
            2.57
            3.9
    ]

    """
def dot(self, other: Series | ArrayLike) -> float | None:
    """
    Compute the dot/inner product between two Series.

    Parameters
    ----------
    other
        Series (or array) to compute dot product with. Anything that is not
        already a Series is first passed to the Series constructor.

    Raises
    ------
    ShapeError
        If the two Series differ in length.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s2 = pl.Series("b", [4.0, 5.0, 6.0])
    >>> s.dot(s2)
    32.0

    """
    rhs = other if isinstance(other, Series) else Series(other)
    n, m = len(self), len(rhs)
    if n != m:
        raise ShapeError(f"Series length mismatch: expected {n}, found {m}")
    return self._s.dot(rhs._s)
def mode(self) -> Series:
    """
    Compute the most occurring value(s).

    Can return multiple values.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.mode()
    shape: (1,)
    Series: 'a' [i64]
    [
            2
    ]

    """
def sign(self) -> Series:
    """
    Compute the element-wise indication of the sign.

    The returned values can be -1, 0, or 1:

    * -1 if x  < 0.
    *  0 if x == 0.
    *  1 if x  > 0.

    (null values are preserved as-is).

    Examples
    --------
    >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None])
    >>> s.sign()
    shape: (5,)
    Series: 'a' [i64]
    [
            -1
            0
            0
            1
            null
    ]

    """
def sin(self) -> Series:
    """
    Compute the element-wise value for the sine.

    As the example shows, the input values are interpreted as radians.

    Examples
    --------
    >>> import math
    >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi])
    >>> s.sin()
    shape: (3,)
    Series: 'a' [f64]
    [
            0.0
            1.0
            1.2246e-16
    ]

    """
def cos(self) -> Series:
    """
    Compute the element-wise value for the cosine.

    As the example shows, the input values are interpreted as radians.

    Examples
    --------
    >>> import math
    >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi])
    >>> s.cos()
    shape: (3,)
    Series: 'a' [f64]
    [
            1.0
            6.1232e-17
            -1.0
    ]

    """
def tan(self) -> Series:
    """
    Compute the element-wise value for the tangent.

    As the example shows, the input values are interpreted as radians.

    Examples
    --------
    >>> import math
    >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi])
    >>> s.tan()
    shape: (3,)
    Series: 'a' [f64]
    [
            0.0
            1.6331e16
            -1.2246e-16
    ]

    """
def arcsin(self) -> Series:
    """
    Compute the element-wise value for the inverse sine.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.arcsin()
    shape: (3,)
    Series: 'a' [f64]
    [
            1.570796
            0.0
            -1.570796
    ]

    """
def arccos(self) -> Series:
    """
    Compute the element-wise value for the inverse cosine.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.arccos()
    shape: (3,)
    Series: 'a' [f64]
    [
            0.0
            1.570796
            3.141593
    ]

    """
def arctan(self) -> Series:
    """
    Compute the element-wise value for the inverse tangent.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.arctan()
    shape: (3,)
    Series: 'a' [f64]
    [
            0.785398
            0.0
            -0.785398
    ]

    """
def arcsinh(self) -> Series:
    """
    Compute the element-wise value for the inverse hyperbolic sine.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.arcsinh()
    shape: (3,)
    Series: 'a' [f64]
    [
            0.881374
            0.0
            -0.881374
    ]

    """
def arccosh(self) -> Series:
    """
    Compute the element-wise value for the inverse hyperbolic cosine.

    As the example shows, inputs below 1.0 yield NaN.

    Examples
    --------
    >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0])
    >>> s.arccosh()
    shape: (4,)
    Series: 'a' [f64]
    [
            2.292432
            0.0
            NaN
            NaN
    ]

    """
def arctanh(self) -> Series:
    """
    Compute the element-wise value for the inverse hyperbolic tangent.

    As the example shows, inputs of exactly 1.0 / -1.0 yield inf / -inf and
    inputs outside [-1.0, 1.0] yield NaN.

    Examples
    --------
    >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1])
    >>> s.arctanh()
    shape: (7,)
    Series: 'a' [f64]
    [
            NaN
            inf
            0.549306
            0.0
            -0.549306
            -inf
            NaN
    ]

    """
def sinh(self) -> Series:
    """
    Compute the element-wise value for the hyperbolic sine.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.sinh()
    shape: (3,)
    Series: 'a' [f64]
    [
            1.175201
            0.0
            -1.175201
    ]

    """
def cosh(self) -> Series:
    """
    Compute the element-wise value for the hyperbolic cosine.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.cosh()
    shape: (3,)
    Series: 'a' [f64]
    [
            1.543081
            1.0
            1.543081
    ]

    """
def tanh(self) -> Series:
    """
    Compute the element-wise value for the hyperbolic tangent.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 0.0, -1.0])
    >>> s.tanh()
    shape: (3,)
    Series: 'a' [f64]
    [
            0.761594
            0.0
            -0.761594
    ]

    """
def apply(
    self,
    function: Callable[[Any], Any],
    return_dtype: PolarsDataType | None = None,
    *,
    skip_nulls: bool = True,
) -> Self:
    """
    Apply a custom/user-defined function (UDF) over elements in this Series.

    If the function returns a different datatype, the return_dtype arg should
    be set, otherwise the method will fail.

    Implementing logic using a Python function is almost always _significantly_
    slower and more memory intensive than implementing the same logic using
    the native expression API because:

    - The native expression engine runs in Rust; UDFs run in Python.
    - Use of Python UDFs forces the DataFrame to be materialized in memory.
    - Polars-native expressions can be parallelised (UDFs typically cannot).
    - Polars-native expressions can be logically optimised (UDFs cannot).

    Wherever possible you should strongly prefer the native expression API
    to achieve the best performance.

    Parameters
    ----------
    function
        Custom function or lambda.
    return_dtype
        Output datatype. If none is given, the same datatype as this Series will be
        used.
    skip_nulls
        Nulls will be skipped and not passed to the python function.
        This is faster because python can be skipped and because we call
        more specialized functions.

    Returns
    -------
    Series

    Notes
    -----
    If your function is expensive and you don't want it to be called more than
    once for a given input, consider applying an ``@lru_cache`` decorator to it.
    With suitable data you may achieve order-of-magnitude speedups (or more).

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.apply(lambda x: x + 10)
    shape: (3,)
    Series: 'a' [i64]
    [
            11
            12
            13
    ]

    """
    # Only convert the requested dtype when one was actually supplied.
    pl_return_dtype = (
        None if return_dtype is None else py_type_to_dtype(return_dtype)
    )
    applied = self._s.apply_lambda(function, pl_return_dtype, skip_nulls)
    return self._from_pyseries(applied)
def shift(self, periods: int = 1) -> Series:
    """
    Shift the values by a given period.

    As the examples show, the vacated positions are filled with null.

    Parameters
    ----------
    periods
        Number of places to shift (may be negative).

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.shift(periods=1)
    shape: (3,)
    Series: 'a' [i64]
    [
            null
            1
            2
    ]
    >>> s.shift(periods=-1)
    shape: (3,)
    Series: 'a' [i64]
    [
            2
            3
            null
    ]

    """
def shift_and_fill(
    self,
    fill_value: int | Expr,
    *,
    periods: int = 1,
) -> Series:
    """
    Shift the values by a given period and fill the resulting null values.

    Parameters
    ----------
    fill_value
        Fill None values with the result of this expression.
    periods
        Number of places to shift (may be negative).

    """
def zip_with(self, mask: Series, other: Series) -> Self:
    """
    Combine this Series with another one, choosing per element via a mask.

    Where mask evaluates true, take values from self. Where mask evaluates false,
    take values from other.

    Parameters
    ----------
    mask
        Boolean Series.
    other
        Series of same type.

    Returns
    -------
    New Series

    Examples
    --------
    >>> s1 = pl.Series([1, 2, 3, 4, 5])
    >>> s2 = pl.Series([5, 4, 3, 2, 1])
    >>> s1.zip_with(s1 < s2, s2)
    shape: (5,)
    Series: '' [i64]
    [
            1
            2
            3
            2
            1
    ]
    >>> mask = pl.Series([True, False, True, False, True])
    >>> s1.zip_with(mask, s2)
    shape: (5,)
    Series: '' [i64]
    [
            1
            4
            3
            2
            5
    ]

    """
    zipped = self._s.zip_with(mask._s, other._s)
    return self._from_pyseries(zipped)
def rolling_min(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Apply a rolling min (moving min) over the values in this array.

    A window of length `window_size` will traverse the array. The values that fill
    this window will (optionally) be multiplied with the weights given by the
    `weight` vector. The resulting values are then aggregated with the rolling
    min.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [100, 200, 300, 400, 500])
    >>> s.rolling_min(window_size=3)
    shape: (5,)
    Series: 'a' [i64]
    [
        null
        null
        100
        200
        300
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    rolling_expr = F.col(self.name).rolling_min(
        window_size, weights, min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_max(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Apply a rolling max (moving max) over the values in this array.

    A window of length `window_size` will traverse the array. The values that fill
    this window will (optionally) be multiplied with the weights given by the
    `weight` vector. The resulting values are then aggregated with the rolling
    max.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [100, 200, 300, 400, 500])
    >>> s.rolling_max(window_size=2)
    shape: (5,)
    Series: 'a' [i64]
    [
        null
        200
        300
        400
        500
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    rolling_expr = F.col(self.name).rolling_max(
        window_size, weights, min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_mean(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Apply a rolling mean (moving mean) over the values in this array.

    A window of length `window_size` will traverse the array. The values that fill
    this window will (optionally) be multiplied with the weights given by the
    `weight` vector. The resulting values are then aggregated with the rolling
    mean.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [100, 200, 300, 400, 500])
    >>> s.rolling_mean(window_size=2)
    shape: (5,)
    Series: 'a' [f64]
    [
        null
        150.0
        250.0
        350.0
        450.0
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    rolling_expr = F.col(self.name).rolling_mean(
        window_size, weights, min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_sum(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Apply a rolling sum (moving sum) over the values in this array.

    A window of length `window_size` will traverse the array. The values that fill
    this window will (optionally) be multiplied with the weights given by the
    `weight` vector. The resulting values will be aggregated to their sum.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4, 5])
    >>> s.rolling_sum(window_size=2)
    shape: (5,)
    Series: 'a' [i64]
    [
        null
        3
        5
        7
        9
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    rolling_expr = F.col(self.name).rolling_sum(
        window_size, weights, min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_std(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Compute a rolling standard deviation.

    A window of length `window_size` will traverse the array. The values that fill
    this window will (optionally) be multiplied with the weights given by the
    `weight` vector. The resulting values are then aggregated with the rolling
    std dev.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
    >>> s.rolling_std(window_size=3)
    shape: (6,)
    Series: 'a' [f64]
    [
            null
            null
            1.0
            1.0
            1.527525
            2.0
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    rolling_expr = F.col(self.name).rolling_std(
        window_size, weights, min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_var(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Compute a rolling variance.

    A window of length `window_size` will traverse the array. The values that fill
    this window will (optionally) be multiplied with the weights given by the
    `weight` vector. The resulting values are then aggregated with the rolling
    variance.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
    >>> s.rolling_var(window_size=3)
    shape: (6,)
    Series: 'a' [f64]
    [
            null
            null
            1.0
            1.0
            2.333333
            4.0
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    rolling_expr = F.col(self.name).rolling_var(
        window_size, weights, min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_apply(
    self,
    function: Callable[[Series], Any],
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Apply a custom rolling window function.

    Prefer the specific rolling window functions over this one, as they are faster:

    * rolling_min
    * rolling_max
    * rolling_mean
    * rolling_sum

    Parameters
    ----------
    function
        Aggregation function
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> import numpy as np
    >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0])
    >>> print(s.rolling_apply(function=np.nanstd, window_size=3))
    shape: (5,)
    Series: 'A' [f64]
    [
        null
        null
        3.858612
        3.5
        0.5
    ]

    """
def rolling_median(
    self,
    window_size: int,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Compute a rolling median.

    Parameters
    ----------
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
    >>> s.rolling_median(window_size=3)
    shape: (6,)
    Series: 'a' [f64]
    [
            null
            null
            2.0
            3.0
            4.0
            6.0
    ]

    """
    # An unspecified `min_periods` defaults to the full window here, before the
    # expression is built.
    effective_min_periods = window_size if min_periods is None else min_periods
    rolling_expr = F.col(self.name).rolling_median(
        window_size, weights, effective_min_periods, center=center
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_quantile(
    self,
    quantile: float,
    interpolation: RollingInterpolationMethod = "nearest",
    window_size: int = 2,
    weights: list[float] | None = None,
    min_periods: int | None = None,
    *,
    center: bool = False,
) -> Series:
    """
    Compute a rolling quantile.

    Parameters
    ----------
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
        Interpolation method.
    window_size
        The length of the window.
    weights
        An optional slice with the same length as the window that will be multiplied
        elementwise with the values in the window.
    min_periods
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to window size.
    center
        Set the labels at the center of the window

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
    >>> s.rolling_quantile(quantile=0.33, window_size=3)
    shape: (6,)
    Series: 'a' [f64]
    [
            null
            null
            1.0
            2.0
            3.0
            4.0
    ]
    >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3)
    shape: (6,)
    Series: 'a' [f64]
    [
            null
            null
            1.66
            2.66
            3.66
            5.32
    ]

    """
    # An unspecified `min_periods` defaults to the full window here, before the
    # expression is built.
    effective_min_periods = window_size if min_periods is None else min_periods
    rolling_expr = F.col(self.name).rolling_quantile(
        quantile,
        interpolation,
        window_size,
        weights,
        effective_min_periods,
        center=center,
    )
    return self.to_frame().select(rolling_expr).to_series()
def rolling_skew(self, window_size: int, *, bias: bool = True) -> Series:
    """
    Compute a rolling skew.

    Parameters
    ----------
    window_size
        Integer size of the rolling window.
    bias
        If False, the calculations are corrected for statistical bias.

    Examples
    --------
    >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
    >>> s.rolling_skew(window_size=3)
    shape: (6,)
    Series: 'a' [f64]
    [
            null
            null
            0.0
            0.0
            0.381802
            0.0
    ]

    """
@deprecated_alias(frac="fraction")
def sample(
    self,
    n: int | None = None,
    *,
    fraction: float | None = None,
    with_replacement: bool = False,
    shuffle: bool = False,
    seed: int | None = None,
) -> Series:
    """
    Draw a sample of the values in this Series.

    Parameters
    ----------
    n
        Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
        `fraction` is None.
    fraction
        Fraction of items to return. Cannot be used with `n`.
    with_replacement
        Allow values to be sampled more than once.
    shuffle
        Shuffle the order of sampled data points.
    seed
        Seed for the random number generator. If set to None (default), a random
        seed is generated using the ``random`` module.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4, 5])
    >>> s.sample(2, seed=0)  # doctest: +IGNORE_RESULT
    shape: (2,)
    Series: 'a' [i64]
    [
        1
        5
    ]

    """
    # Evaluate the equivalent expression on a single-column frame.
    sample_expr = F.col(self.name).sample(
        n,
        fraction=fraction,
        with_replacement=with_replacement,
        shuffle=shuffle,
        seed=seed,
    )
    return self.to_frame().select(sample_expr).to_series()
def peak_max(self) -> Self:
    """
    Return a boolean mask that marks the local maximum peaks.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3, 4, 5])
    >>> s.peak_max()
    shape: (5,)
    Series: '' [bool]
    [
            false
            false
            false
            false
            true
    ]

    """
    peaks = self._s.peak_max()
    return self._from_pyseries(peaks)
def peak_min(self) -> Self:
    """
    Return a boolean mask that marks the local minimum peaks.

    Examples
    --------
    >>> s = pl.Series("a", [4, 1, 3, 2, 5])
    >>> s.peak_min()
    shape: (5,)
    Series: '' [bool]
    [
        false
        true
        false
        true
        false
    ]

    """
    peaks = self._s.peak_min()
    return self._from_pyseries(peaks)
def n_unique(self) -> int:
    """
    Return how many distinct values this Series contains.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 2, 3])
    >>> s.n_unique()
    3

    """
    unique_count = self._s.n_unique()
    return unique_count
def shrink_to_fit(self, *, in_place: bool = False) -> Series:
    """
    Shrink Series memory usage.

    Shrinks the underlying array capacity to exactly fit the actual data.
    (Note that this function does not change the Series data type).

    Parameters
    ----------
    in_place
        If True, shrink this Series and return it; otherwise shrink and
        return a clone, leaving this Series untouched.

    """
    # Operate on a clone unless the caller explicitly asked for in-place.
    target = self if in_place else self.clone()
    target._s.shrink_to_fit()
    return target
def hash(
    self,
    seed: int = 0,
    seed_1: int | None = None,
    seed_2: int | None = None,
    seed_3: int | None = None,
) -> Series:
    """
    Hash the Series.

    The hash value is of type `UInt64`.

    Parameters
    ----------
    seed
        Random seed parameter. Defaults to 0.
    seed_1
        Random seed parameter. Defaults to `seed` if not set.
    seed_2
        Random seed parameter. Defaults to `seed` if not set.
    seed_3
        Random seed parameter. Defaults to `seed` if not set.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.hash(seed=42)  # doctest: +IGNORE_RESULT
    shape: (3,)
    Series: 'a' [u64]
    [
            10734580197236529959
            3022416320763508302
            13756996518000038261
    ]

    """
def reinterpret(self, *, signed: bool = True) -> Series:
    """
    Reinterpret the underlying bits as a signed/unsigned integer.

    This operation is only allowed for 64-bit integers. For integers with
    fewer bits, you can safely use the cast operation instead.

    Parameters
    ----------
    signed
        If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`.

    """
def interpolate(self, method: InterpolationMethod = "linear") -> Series:
    """
    Interpolate intermediate values using the given method (linear by default).

    Parameters
    ----------
    method : {'linear', 'nearest'}
        Interpolation method.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, None, None, 5])
    >>> s.interpolate()
    shape: (5,)
    Series: 'a' [i64]
    [
            1
            2
            3
            4
            5
    ]

    """
def abs(self) -> Series:
    """
    Compute absolute values.

    Same as `abs(series)`.

    """
def rank(
    self,
    method: RankMethod = "average",
    *,
    descending: bool = False,
    seed: int | None = None,
) -> Series:
    """
    Assign ranks to data, dealing with ties appropriately.

    The Series is wrapped in a single-column frame and the ranking is
    delegated to the expression of the same name.

    Parameters
    ----------
    method : {'average', 'min', 'max', 'dense', 'ordinal', 'random'}
        The method used to assign ranks to tied elements.
        The following methods are available (default is 'average'):

        - 'average' : The average of the ranks that would have been assigned to
          all the tied values is assigned to each value.
        - 'min' : The minimum of the ranks that would have been assigned to all
          the tied values is assigned to each value. (This is also referred to
          as "competition" ranking.)
        - 'max' : The maximum of the ranks that would have been assigned to all
          the tied values is assigned to each value.
        - 'dense' : Like 'min', but the rank of the next highest element is
          assigned the rank immediately after those assigned to the tied
          elements.
        - 'ordinal' : All values are given a distinct rank, corresponding to
          the order that the values occur in the Series.
        - 'random' : Like 'ordinal', but the rank for ties is not dependent
          on the order that the values occur in the Series.
    descending
        Rank in descending order.
    seed
        If `method="random"`, use this as seed.

    Examples
    --------
    The 'average' method:

    >>> s = pl.Series("a", [3, 6, 1, 1, 6])
    >>> s.rank()
    shape: (5,)
    Series: 'a' [f32]
    [
            3.0
            4.5
            1.5
            1.5
            4.5
    ]

    The 'ordinal' method:

    >>> s = pl.Series("a", [3, 6, 1, 1, 6])
    >>> s.rank("ordinal")
    shape: (5,)
    Series: 'a' [u32]
    [
            3
            4
            1
            2
            5
    ]

    """
    frame = self.to_frame()
    rank_expr = F.col(self._s.name()).rank(
        method=method, descending=descending, seed=seed
    )
    return frame.select(rank_expr).to_series()
def diff(self, n: int = 1, null_behavior: NullBehavior = "ignore") -> Series:
    """
    Calculate the n-th discrete difference.

    Parameters
    ----------
    n
        Number of slots to shift.
    null_behavior : {'ignore', 'drop'}
        How to handle null values.

    Examples
    --------
    >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8)
    >>> s.diff()
    shape: (5,)
    Series: 's' [i8]
    [
            null
            -10
            20
            -5
            10
    ]

    >>> s.diff(n=2)
    shape: (5,)
    Series: 's' [i8]
    [
            null
            null
            10
            15
            5
    ]

    >>> s.diff(n=2, null_behavior="drop")
    shape: (3,)
    Series: 's' [i8]
    [
            10
            15
            5
    ]

    """
def pct_change(self, n: int = 1) -> Series:
    """
    Compute the percentage change between values.

    Percentage change (as fraction) between current element and most-recent
    non-null element at least ``n`` period(s) before the current element.

    Computes the change from the previous row by default.

    Parameters
    ----------
    n
        Number of periods to shift when forming the percent change.

    Examples
    --------
    >>> pl.Series(range(10)).pct_change()
    shape: (10,)
    Series: '' [f64]
    [
            null
            inf
            1.0
            0.5
            0.333333
            0.25
            0.2
            0.166667
            0.142857
            0.125
    ]

    >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2)
    shape: (10,)
    Series: '' [f64]
    [
            null
            null
            3.0
            3.0
            3.0
            3.0
            3.0
            3.0
            3.0
            3.0
    ]

    """
def skew(self, *, bias: bool = True) -> float | None:
    r"""
    Compute the sample skewness of a data set.

    For normally distributed data, the skewness should be about zero. For
    unimodal continuous distributions, a skewness value greater than zero means
    that there is more weight in the right tail of the distribution. The
    function `skewtest` can be used to determine if the skewness value
    is close enough to zero, statistically speaking.

    See scipy.stats for more information.

    Parameters
    ----------
    bias : bool, optional
        If False, the calculations are corrected for statistical bias.

    Notes
    -----
    The sample skewness is computed as the Fisher-Pearson coefficient
    of skewness, i.e.

    .. math:: g_1=\frac{m_3}{m_2^{3/2}}

    where

    .. math:: m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i

    is the biased sample :math:`i\texttt{th}` central moment, and
    :math:`\bar{x}` is
    the sample mean. If ``bias`` is False, the calculations are
    corrected for bias and the value computed is the adjusted
    Fisher-Pearson standardized moment coefficient, i.e.

    .. math::
        G_1 = \frac{k_3}{k_2^{3/2}} = \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}

    """
    skewness = self._s.skew(bias)
    return skewness
def kurtosis(self, *, fisher: bool = True, bias: bool = True) -> float | None:
    """
    Compute the kurtosis (Fisher or Pearson) of a dataset.

    Kurtosis is the fourth central moment divided by the square of the
    variance. If Fisher's definition is used, then 3.0 is subtracted from
    the result to give 0.0 for a normal distribution.
    If bias is False then the kurtosis is calculated using k statistics to
    eliminate bias coming from biased moment estimators.

    See scipy.stats for more information.

    Parameters
    ----------
    fisher : bool, optional
        If True, Fisher's definition is used (normal ==> 0.0). If False,
        Pearson's definition is used (normal ==> 3.0).
    bias : bool, optional
        If False, the calculations are corrected for statistical bias.

    """
    kurt = self._s.kurtosis(fisher, bias)
    return kurt
def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series:
    """
    Clip (limit) the values in an array to a `min` and `max` boundary.

    Only works for numerical types.

    If you want to clip other dtypes, consider writing a "when, then, otherwise"
    expression. See :func:`when` for more information.

    Parameters
    ----------
    lower_bound
        Minimum value.
    upper_bound
        Maximum value.

    Examples
    --------
    >>> s = pl.Series("foo", [-50, 5, None, 50])
    >>> s.clip(1, 10)
    shape: (4,)
    Series: 'foo' [i64]
    [
            1
            5
            null
            10
    ]

    """
def clip_min(self, lower_bound: int | float) -> Series:
    """
    Clip (limit) the values in an array to a `min` boundary.

    Only works for numerical types.

    If you want to clip other dtypes, consider writing a "when, then, otherwise"
    expression. See :func:`when` for more information.

    Parameters
    ----------
    lower_bound
        Lower bound.

    """
def clip_max(self, upper_bound: int | float) -> Series:
    """
    Clip (limit) the values in an array to a `max` boundary.

    Only works for numerical types.

    If you want to clip other dtypes, consider writing a "when, then, otherwise"
    expression. See :func:`when` for more information.

    Parameters
    ----------
    upper_bound
        Upper bound.

    """
def lower_bound(self) -> Self:
    """
    Return the lower bound of this Series' dtype as a unit Series.

    See Also
    --------
    upper_bound : return the upper bound of the given Series' dtype.

    Examples
    --------
    >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32)
    >>> s.lower_bound()
    shape: (1,)
    Series: 's' [i32]
    [
            -2147483648
    ]

    >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32)
    >>> s.lower_bound()
    shape: (1,)
    Series: 's' [f32]
    [
            -inf
    ]

    """
def upper_bound(self) -> Self:
    """
    Return the upper bound of this Series' dtype as a unit Series.

    See Also
    --------
    lower_bound : return the lower bound of the given Series' dtype.

    Examples
    --------
    >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8)
    >>> s.upper_bound()
    shape: (1,)
    Series: 's' [i8]
    [
            127
    ]

    >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64)
    >>> s.upper_bound()
    shape: (1,)
    Series: 's' [f64]
    [
            inf
    ]

    """
def map_dict(
    self,
    remapping: dict[Any, Any],
    *,
    default: Any = None,
    return_dtype: PolarsDataType | None = None,
) -> Self:
    """
    Replace values in the Series using a remapping dictionary.

    Parameters
    ----------
    remapping
        Dictionary containing the before/after values to map.
    default
        Value to use when the remapping dict does not contain the lookup value.
        Use ``pl.first()`` to keep the original value.
    return_dtype
        Set return dtype to override automatic return dtype determination.

    Examples
    --------
    >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"])
    >>> country_lookup = {
    ...     "JPN": "Japan",
    ...     "TUR": "Türkiye",
    ...     "NLD": "Netherlands",
    ... }

    Remap, setting a default for unrecognised values...

    >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name")
    shape: (4,)
    Series: 'country_name' [str]
    [
            "Türkiye"
            "Unspecified"
            "Japan"
            "Netherlands"
    ]

    ...or keep the original value, by making use of ``pl.first()``:

    >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name")
    shape: (4,)
    Series: 'country_name' [str]
    [
            "Türkiye"
            "???"
            "Japan"
            "Netherlands"
    ]

    ...or keep the original value, by assigning the input series:

    >>> s.map_dict(country_lookup, default=s).rename("country_name")
    shape: (4,)
    Series: 'country_name' [str]
    [
            "Türkiye"
            "???"
            "Japan"
            "Netherlands"
    ]

    Override return dtype:

    >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8)
    >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16)
    shape: (3,)
    Series: 'int8' [i16]
    [
            5
            7
            3
    ]

    """
def reshape(self, dimensions: tuple[int, ...]) -> Series:
    """
    Reshape this Series to a flat Series or a Series of Lists.

    Parameters
    ----------
    dimensions
        Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that
        dimension is inferred.

    Returns
    -------
    Series
        If a single dimension is given, results in a flat Series of shape (len,).
        If multiple dimensions are given, results in a Series of Lists with shape
        (rows, cols).

    See Also
    --------
    ListNameSpace.explode : Explode a list column.

    Examples
    --------
    >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> s.reshape((3, 3))
    shape: (3,)
    Series: 'foo' [list[i64]]
    [
            [1, 2, 3]
            [4, 5, 6]
            [7, 8, 9]
    ]

    """
def shuffle(self, seed: int | None = None) -> Series:
    """
    Shuffle the contents of this Series.

    Parameters
    ----------
    seed
        Seed for the random number generator. If set to None (default), a random
        seed is generated using the ``random`` module.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.shuffle(seed=1)
    shape: (3,)
    Series: 'a' [i64]
    [
            2
            1
            3
    ]

    """
def ewm_mean(
    self,
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
    *,
    adjust: bool = True,
    min_periods: int = 1,
    ignore_nulls: bool = True,
) -> Series:
    r"""
    Exponentially-weighted moving average.

    Parameters
    ----------
    com
        Specify decay in terms of center of mass, :math:`\gamma`, with

            .. math::
                \alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0
    span
        Specify decay in terms of span, :math:`\theta`, with

            .. math::
                \alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1
    half_life
        Specify decay in terms of half-life, :math:`\lambda`, with

            .. math::
                \alpha = 1 - \exp \left\{ \frac{ -\ln(2) }{ \lambda } \right\} \;
                \forall \; \lambda > 0
    alpha
        Specify smoothing factor alpha directly, :math:`0 < \alpha \leq 1`.
    adjust
        Divide by decaying adjustment factor in beginning periods to account for
        imbalance in relative weightings

            - When ``adjust=True`` (default) the EW function is calculated
              using weights :math:`w_i = (1 - \alpha)^i`
            - When ``adjust=False`` the EW function is calculated
              recursively by

              .. math::
                y_0 &= x_0 \\
                y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t
    min_periods
        Minimum number of observations in window required to have a value
        (otherwise result is null).
    ignore_nulls
        Ignore missing values when calculating weights.

            - When ``ignore_nulls=False``, weights are based on absolute
              positions.
              For example, the weights of :math:`x_0` and :math:`x_2` used in
              calculating the final weighted average of
              [:math:`x_0`, None, :math:`x_2`] are
              :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and
              :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.

            - When ``ignore_nulls=True`` (default), weights are based
              on relative positions. For example, the weights of
              :math:`x_0` and :math:`x_2` used in calculating the final weighted
              average of [:math:`x_0`, None, :math:`x_2`] are
              :math:`1-\alpha` and :math:`1` if ``adjust=True``,
              and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.

    """
def ewm_std(
    self,
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
    *,
    adjust: bool = True,
    bias: bool = False,
    min_periods: int = 1,
    ignore_nulls: bool = True,
) -> Series:
    r"""
    Exponentially-weighted moving standard deviation.

    Parameters
    ----------
    com
        Specify decay in terms of center of mass, :math:`\gamma`, with

            .. math::
                \alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0
    span
        Specify decay in terms of span, :math:`\theta`, with

            .. math::
                \alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1
    half_life
        Specify decay in terms of half-life, :math:`\lambda`, with

            .. math::
                \alpha = 1 - \exp \left\{ \frac{ -\ln(2) }{ \lambda } \right\} \;
                \forall \; \lambda > 0
    alpha
        Specify smoothing factor alpha directly, :math:`0 < \alpha \leq 1`.
    adjust
        Divide by decaying adjustment factor in beginning periods to account for
        imbalance in relative weightings

            - When ``adjust=True`` (default) the EW function is calculated
              using weights :math:`w_i = (1 - \alpha)^i`
            - When ``adjust=False`` the EW function is calculated
              recursively by

              .. math::
                y_0 &= x_0 \\
                y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t
    bias
        When ``bias=False`` (default), apply a correction to make the estimate
        statistically unbiased.
    min_periods
        Minimum number of observations in window required to have a value
        (otherwise result is null).
    ignore_nulls
        Ignore missing values when calculating weights.

            - When ``ignore_nulls=False``, weights are based on absolute
              positions.
              For example, the weights of :math:`x_0` and :math:`x_2` used in
              calculating the final weighted average of
              [:math:`x_0`, None, :math:`x_2`] are
              :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and
              :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.

            - When ``ignore_nulls=True`` (default), weights are based
              on relative positions. For example, the weights of
              :math:`x_0` and :math:`x_2` used in calculating the final weighted
              average of [:math:`x_0`, None, :math:`x_2`] are
              :math:`1-\alpha` and :math:`1` if ``adjust=True``,
              and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.ewm_std(com=1)
    shape: (3,)
    Series: 'a' [f64]
    [
            0.0
            0.707107
            0.963624
    ]

    """
def ewm_var(
    self,
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
    *,
    adjust: bool = True,
    bias: bool = False,
    min_periods: int = 1,
    ignore_nulls: bool = True,
) -> Series:
    r"""
    Exponentially-weighted moving variance.

    Parameters
    ----------
    com
        Specify decay in terms of center of mass, :math:`\gamma`, with

            .. math::
                \alpha = \frac{1}{1 + \gamma} \; \forall \; \gamma \geq 0
    span
        Specify decay in terms of span, :math:`\theta`, with

            .. math::
                \alpha = \frac{2}{\theta + 1} \; \forall \; \theta \geq 1
    half_life
        Specify decay in terms of half-life, :math:`\lambda`, with

            .. math::
                \alpha = 1 - \exp \left\{ \frac{ -\ln(2) }{ \lambda } \right\} \;
                \forall \; \lambda > 0
    alpha
        Specify smoothing factor alpha directly, :math:`0 < \alpha \leq 1`.
    adjust
        Divide by decaying adjustment factor in beginning periods to account for
        imbalance in relative weightings

            - When ``adjust=True`` (default) the EW function is calculated
              using weights :math:`w_i = (1 - \alpha)^i`
            - When ``adjust=False`` the EW function is calculated
              recursively by

              .. math::
                y_0 &= x_0 \\
                y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t
    bias
        When ``bias=False`` (default), apply a correction to make the estimate
        statistically unbiased.
    min_periods
        Minimum number of observations in window required to have a value
        (otherwise result is null).
    ignore_nulls
        Ignore missing values when calculating weights.

            - When ``ignore_nulls=False``, weights are based on absolute
              positions.
              For example, the weights of :math:`x_0` and :math:`x_2` used in
              calculating the final weighted average of
              [:math:`x_0`, None, :math:`x_2`] are
              :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and
              :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.

            - When ``ignore_nulls=True`` (default), weights are based
              on relative positions. For example, the weights of
              :math:`x_0` and :math:`x_2` used in calculating the final weighted
              average of [:math:`x_0`, None, :math:`x_2`] are
              :math:`1-\alpha` and :math:`1` if ``adjust=True``,
              and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.ewm_var(com=1)
    shape: (3,)
    Series: 'a' [f64]
    [
            0.0
            0.5
            0.928571
    ]

    """
def extend_constant(self, value: PythonLiteral | None, n: int) -> Series:
    """
    Extremely fast method for extending the Series with 'n' copies of a value.

    Parameters
    ----------
    value
        A constant literal value (not an expression) with which to extend
        the Series; can pass None to extend with nulls.
    n
        The number of additional values that will be added.

    Examples
    --------
    >>> s = pl.Series([1, 2, 3])
    >>> s.extend_constant(99, n=2)
    shape: (5,)
    Series: '' [i64]
    [
            1
            2
            3
            99
            99
    ]

    """
def set_sorted(self, *, descending: bool = False) -> Self:
    """
    Flag the Series as 'sorted'.

    Enables downstream code to use fast paths for sorted arrays.

    Parameters
    ----------
    descending
        If the `Series` order is descending.

    Warnings
    --------
    This can lead to incorrect results if this `Series` is not sorted!!
    Use with care!

    Examples
    --------
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.set_sorted().max()
    3

    """
    flagged = self._s.set_sorted_flag(descending)
    return self._from_pyseries(flagged)
def new_from_index(self, index: int, length: int) -> Self:
    """
    Create a new Series filled with values from the given index.

    Parameters
    ----------
    index
        Index passed through to the underlying series.
    length
        Length passed through to the underlying series.

    """
    filled = self._s.new_from_index(index, length)
    return self._from_pyseries(filled)
def shrink_dtype(self) -> Series:
    """
    Shrink numeric columns to the minimal required datatype.

    Shrink to the dtype needed to fit the extrema of this `Series`.
    This can be used to reduce memory pressure.

    """
def get_chunks(self) -> list[Series]:
    """
    Get the chunks of this Series as a list of Series.

    The result is returned exactly as produced by the underlying series.

    """
    chunks = self._s.get_chunks()
    return chunks
def implode(self) -> Self:
    """Aggregate values into a list."""
# Below are the namespaces defined. Do not move these up in the definition of
# Series, as it confuses mypy between the type annotation `str` and the
# namespace `str`
@property
def arr(self) -> ListNameSpace:
    """Return the namespace object exposing all list related methods."""
    list_namespace = ListNameSpace(self)
    return list_namespace
@property
def bin(self) -> BinaryNameSpace:
    """Return the namespace object exposing all binary related methods."""
    binary_namespace = BinaryNameSpace(self)
    return binary_namespace
@property
def cat(self) -> CatNameSpace:
    """Return the namespace object exposing all categorical related methods."""
    categorical_namespace = CatNameSpace(self)
    return categorical_namespace
@property
def dt(self) -> DateTimeNameSpace:
    """Return the namespace object exposing all datetime related methods."""
    datetime_namespace = DateTimeNameSpace(self)
    return datetime_namespace
@property
def str(self) -> StringNameSpace:
    """Return the namespace object exposing all string related methods."""
    string_namespace = StringNameSpace(self)
    return string_namespace
@property
def struct(self) -> StructNameSpace:
    """Return the namespace object exposing all struct related methods."""
    struct_namespace = StructNameSpace(self)
    return struct_namespace
def _resolve_datetime_dtype(
    dtype: PolarsDataType | None, ndtype: np.datetime64
) -> PolarsDataType | None:
    """
    Given polars/numpy datetime dtypes, resolve to an explicit unit.

    Parameters
    ----------
    dtype
        Requested polars dtype. Only ``None`` or a ``Datetime`` without a time
        unit is resolved; anything else is returned unchanged.
    ndtype
        NumPy datetime dtype whose unit is used when `dtype` carries none.

    Returns
    -------
    ``Datetime`` with an explicit "ns", "us" or "ms" unit when one of those
    units is found; otherwise the `dtype` that was passed in.

    """
    unit_missing = dtype is None or (
        dtype == Datetime and not getattr(dtype, "time_unit", None)
    )
    if not unit_missing:
        return dtype

    time_unit = getattr(dtype, "time_unit", None) or np.datetime_data(ndtype)[0]

    # explicit formulation is verbose, but keeps mypy happy
    # (and avoids unsupported timeunits such as "s")
    if time_unit == "ns":
        return Datetime("ns")
    if time_unit == "us":
        return Datetime("us")
    if time_unit == "ms":
        return Datetime("ms")
    return dtype