Source code for polars.datatypes.classes

from __future__ import annotations

import contextlib
from collections import OrderedDict
from collections.abc import Mapping
from datetime import timezone
from inspect import isclass
from typing import TYPE_CHECKING, Any

import polars._reexport as pl
import polars.datatypes
import polars.functions as F

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars.polars import dtype_str_repr as _dtype_str_repr

if TYPE_CHECKING:
    from collections.abc import Iterable, Iterator, Sequence

    from polars import Series
    from polars._typing import (
        CategoricalOrdering,
        PolarsDataType,
        PythonDataType,
        SchemaDict,
        TimeUnit,
    )


class classinstmethod(classmethod):  # type: ignore[type-arg]
    """Decorator that allows a method to be called from the class OR instance."""

    def __get__(self, instance: Any, type_: type) -> Any:  # type: ignore[override]
        get = super().__get__ if instance is None else self.__func__.__get__
        return get(instance, type_)


class DataTypeClass(type):
    """Metaclass for nicely printing DataType classes."""

    def __repr__(cls) -> str:
        return cls.__name__

    def _string_repr(cls) -> str:
        return _dtype_str_repr(cls)

    # Methods below defined here in signature only to satisfy mypy

    @classmethod
    def base_type(cls) -> DataTypeClass:  # noqa: D102
        ...

    @classmethod
    def is_(cls, other: PolarsDataType) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_numeric(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_decimal(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_integer(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_signed_integer(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_unsigned_integer(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_float(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_temporal(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def is_nested(cls) -> bool:  # noqa: D102
        ...

    @classmethod
    def from_python(cls, py_type: PythonDataType) -> PolarsDataType:  # noqa: D102
        ...

    @classmethod
    def to_python(self) -> PythonDataType:  # noqa: D102
        ...


class DataType(metaclass=DataTypeClass):
    """Base class for all Polars data types."""

    def _string_repr(self) -> str:
        return _dtype_str_repr(self)

    def __eq__(self, other: PolarsDataType) -> bool:  # type: ignore[override]
        if type(other) is DataTypeClass:
            return issubclass(other, type(self))
        else:
            return isinstance(other, type(self))

    def __hash__(self) -> int:
        return hash(self.__class__)

    def __repr__(self) -> str:
        return self.__class__.__name__

    @classmethod
    def base_type(cls) -> DataTypeClass:
        """
        Return this DataType's fundamental/root type class.

        Examples
        --------
        >>> pl.Datetime("ns").base_type()
        Datetime
        >>> pl.List(pl.Int32).base_type()
        List
        >>> pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)]).base_type()
        Struct
        """
        return cls

    @classinstmethod  # type: ignore[arg-type]
    def is_(self, other: PolarsDataType) -> bool:
        """
        Check if this DataType is the same as another DataType.

        This is a stricter check than `self == other`, as it enforces an exact
        match of all dtype attributes for nested and/or uninitialised dtypes.

        Parameters
        ----------
        other
            the other polars dtype to compare with.

        Examples
        --------
        >>> pl.List == pl.List(pl.Int32)
        True
        >>> pl.List.is_(pl.List(pl.Int32))
        False
        """
        return self == other and hash(self) == hash(other)

    @classmethod
    def is_numeric(cls) -> bool:
        """Check whether the data type is a numeric type."""
        return issubclass(cls, NumericType)

    @classmethod
    def is_decimal(cls) -> bool:
        """Check whether the data type is a decimal type."""
        return issubclass(cls, Decimal)

    @classmethod
    def is_integer(cls) -> bool:
        """Check whether the data type is an integer type."""
        return issubclass(cls, IntegerType)

    @classmethod
    def is_signed_integer(cls) -> bool:
        """Check whether the data type is a signed integer type."""
        return issubclass(cls, SignedIntegerType)

    @classmethod
    def is_unsigned_integer(cls) -> bool:
        """Check whether the data type is an unsigned integer type."""
        return issubclass(cls, UnsignedIntegerType)

    @classmethod
    def is_float(cls) -> bool:
        """Check whether the data type is a floating point type."""
        return issubclass(cls, FloatType)

    @classmethod
    def is_temporal(cls) -> bool:
        """Check whether the data type is a temporal type."""
        return issubclass(cls, TemporalType)

    @classmethod
    def is_nested(cls) -> bool:
        """Check whether the data type is a nested type."""
        return issubclass(cls, NestedType)

    @classmethod
    def from_python(cls, py_type: PythonDataType) -> PolarsDataType:
        """
        Return the Polars data type corresponding to a given Python type.

        Notes
        -----
        Not every Python type has a corresponding Polars data type; in general
        you should declare Polars data types explicitly to exactly specify
        the desired type and its properties (such as scale/unit).

        Examples
        --------
        >>> pl.DataType.from_python(int)
        Int64
        >>> pl.DataType.from_python(float)
        Float64
        >>> from datetime import tzinfo
        >>> pl.DataType.from_python(tzinfo)  # doctest: +SKIP
        TypeError: cannot parse input <class 'datetime.tzinfo'> into Polars data type
        """
        from polars.datatypes._parse import parse_into_dtype

        return parse_into_dtype(py_type)

    @classinstmethod  # type: ignore[arg-type]
    def to_python(self) -> PythonDataType:
        """
        Return the Python type corresponding to this Polars data type.

        Examples
        --------
        >>> pl.Int16().to_python()
        <class 'int'>
        >>> pl.Float32().to_python()
        <class 'float'>
        >>> pl.Array(pl.Date(), 10).to_python()
        <class 'list'>
        """
        from polars.datatypes import dtype_to_py_type

        return dtype_to_py_type(self)


class NumericType(DataType):
    """Base class for numeric data types."""


class IntegerType(NumericType):
    """Base class for integer data types."""


class SignedIntegerType(IntegerType):
    """Base class for signed integer data types."""


class UnsignedIntegerType(IntegerType):
    """Base class for unsigned integer data types."""


class FloatType(NumericType):
    """Base class for float data types."""


class TemporalType(DataType):
    """Base class for temporal data types."""


class NestedType(DataType):
    """Base class for nested data types."""


[docs] class Int8(SignedIntegerType): """8-bit signed integer type."""
[docs] class Int16(SignedIntegerType): """16-bit signed integer type."""
[docs] class Int32(SignedIntegerType): """32-bit signed integer type."""
[docs] class Int64(SignedIntegerType): """64-bit signed integer type."""
[docs] class UInt8(UnsignedIntegerType): """8-bit unsigned integer type."""
[docs] class UInt16(UnsignedIntegerType): """16-bit unsigned integer type."""
[docs] class UInt32(UnsignedIntegerType): """32-bit unsigned integer type."""
[docs] class UInt64(UnsignedIntegerType): """64-bit unsigned integer type."""
[docs] class Float32(FloatType): """32-bit floating point type."""
[docs] class Float64(FloatType): """64-bit floating point type."""
class Decimal(NumericType): """ Decimal 128-bit type with an optional precision and non-negative scale. .. warning:: This functionality is considered **unstable**. It is a work-in-progress feature and may not always work as expected. It may be changed at any point without it being considered a breaking change. Parameters ---------- precision Maximum number of digits in each number. If set to `None` (default), the precision is inferred. scale Number of digits to the right of the decimal point in each number. """ precision: int | None scale: int def __init__( self, precision: int | None = None, scale: int = 0, ) -> None: # Issuing the warning on `__init__` does not trigger when the class is used # without being instantiated, but it's better than nothing from polars._utils.unstable import issue_unstable_warning issue_unstable_warning( "The Decimal data type is considered unstable." " It is a work-in-progress feature and may not always work as expected." ) self.precision = precision self.scale = scale def __repr__(self) -> str: return ( f"{self.__class__.__name__}(precision={self.precision}, scale={self.scale})" ) def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, Decimal): return True elif isinstance(other, Decimal): return self.precision == other.precision and self.scale == other.scale else: return False def __hash__(self) -> int: return hash((self.__class__, self.precision, self.scale))
[docs] class Boolean(DataType): """Boolean type."""
class String(DataType): """UTF-8 encoded string type.""" # Allow Utf8 as an alias for String Utf8 = String class Binary(DataType): """Binary type."""
[docs] class Date(TemporalType): """ Data type representing a calendar date. Notes ----- The underlying representation of this type is a 32-bit signed integer. The integer indicates the number of days since the Unix epoch (1970-01-01). The number can be negative to indicate dates before the epoch. """
class Time(TemporalType): """ Data type representing the time of day. Notes ----- The underlying representation of this type is a 64-bit signed integer. The integer indicates the number of nanoseconds since midnight. """
[docs] class Datetime(TemporalType): """ Data type representing a calendar date and time of day. Parameters ---------- time_unit : {'us', 'ns', 'ms'} Unit of time. Defaults to `'us'` (microseconds). time_zone Time zone string, as defined in zoneinfo (to see valid strings run `import zoneinfo; zoneinfo.available_timezones()` for a full list). When used to match dtypes, can set this to "*" to check for Datetime columns that have any (non-null) timezone. Notes ----- The underlying representation of this type is a 64-bit signed integer. The integer indicates the number of time units since the Unix epoch (1970-01-01 00:00:00). The number can be negative to indicate datetimes before the epoch. """ time_unit: TimeUnit time_zone: str | None def __init__( self, time_unit: TimeUnit = "us", time_zone: str | timezone | None = None ) -> None: if time_unit not in ("ms", "us", "ns"): msg = ( "invalid `time_unit`" f"\n\nExpected one of {{'ns','us','ms'}}, got {time_unit!r}." ) raise ValueError(msg) if isinstance(time_zone, timezone): time_zone = str(time_zone) self.time_unit = time_unit self.time_zone = time_zone def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, Datetime): return True elif isinstance(other, Datetime): return ( self.time_unit == other.time_unit and self.time_zone == other.time_zone ) else: return False def __hash__(self) -> int: return hash((self.__class__, self.time_unit, self.time_zone)) def __repr__(self) -> str: class_name = self.__class__.__name__ return ( f"{class_name}(time_unit={self.time_unit!r}, time_zone={self.time_zone!r})" )
class Duration(TemporalType): """ Data type representing a time duration. Parameters ---------- time_unit : {'us', 'ns', 'ms'} Unit of time. Defaults to `'us'` (microseconds). Notes ----- The underlying representation of this type is a 64-bit signed integer. The integer indicates an amount of time units and can be negative to indicate negative time offsets. """ time_unit: TimeUnit def __init__(self, time_unit: TimeUnit = "us") -> None: if time_unit not in ("ms", "us", "ns"): msg = ( "invalid `time_unit`" f"\n\nExpected one of {{'ns','us','ms'}}, got {time_unit!r}." ) raise ValueError(msg) self.time_unit = time_unit def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, Duration): return True elif isinstance(other, Duration): return self.time_unit == other.time_unit else: return False def __hash__(self) -> int: return hash((self.__class__, self.time_unit)) def __repr__(self) -> str: class_name = self.__class__.__name__ return f"{class_name}(time_unit={self.time_unit!r})" class Categorical(DataType): """ A categorical encoding of a set of strings. Parameters ---------- ordering : {'lexical', 'physical'} Ordering by order of appearance (`'physical'`, default) or string value (`'lexical'`). """ ordering: CategoricalOrdering | None def __init__( self, ordering: CategoricalOrdering | None = "physical", ) -> None: self.ordering = ordering def __repr__(self) -> str: return f"{self.__class__.__name__}(ordering={self.ordering!r})" def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, Categorical): return True elif isinstance(other, Categorical): return self.ordering == other.ordering else: return False def __hash__(self) -> int: return hash((self.__class__, self.ordering)) class Enum(DataType): """ A fixed set categorical encoding of a set of strings. .. warning:: This functionality is considered **unstable**. It is a work-in-progress feature and may not always work as expected. It may be changed at any point without it being considered a breaking change. Parameters ---------- categories The categories in the dataset. Categories must be strings. """ categories: Series def __init__(self, categories: Series | Iterable[str]) -> None: # Issuing the warning on `__init__` does not trigger when the class is used # without being instantiated, but it's better than nothing from polars._utils.unstable import issue_unstable_warning issue_unstable_warning( "The Enum data type is considered unstable." " It is a work-in-progress feature and may not always work as expected." ) if not isinstance(categories, pl.Series): categories = pl.Series(values=categories) if categories.is_empty(): self.categories = pl.Series(name="category", dtype=String) return if categories.has_nulls(): msg = "Enum categories must not contain null values" raise TypeError(msg) if (dtype := categories.dtype) != String: msg = f"Enum categories must be strings; found data of type {dtype}" raise TypeError(msg) if categories.n_unique() != categories.len(): duplicate = categories.filter(categories.is_duplicated())[0] msg = f"Enum categories must be unique; found duplicate {duplicate!r}" raise ValueError(msg) self.categories = categories.rechunk().alias("category") def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, Enum): return True elif isinstance(other, Enum): return self.categories.equals(other.categories) else: return False def __hash__(self) -> int: return hash((self.__class__, tuple(self.categories))) def __repr__(self) -> str: class_name = self.__class__.__name__ return f"{class_name}(categories={self.categories.to_list()!r})" def union(self, other: Enum) -> Enum: """Union of two Enums.""" return Enum( F.concat((self.categories, other.categories)).unique(maintain_order=True) ) __or__ = union
[docs] class Object(DataType): """Data type for wrapping arbitrary Python objects."""
class Null(DataType): """Data type representing null values.""" class Unknown(DataType): """Type representing DataType values that could not be determined statically."""
[docs] class List(NestedType): """ Variable length list type. Parameters ---------- inner The `DataType` of the values within each list. Examples -------- >>> df = pl.DataFrame( ... { ... "integer_lists": [[1, 2], [3, 4]], ... "float_lists": [[1.0, 2.0], [3.0, 4.0]], ... } ... ) >>> df shape: (2, 2) ┌───────────────┬─────────────┐ │ integer_lists ┆ float_lists │ │ --- ┆ --- │ │ list[i64] ┆ list[f64] │ ╞═══════════════╪═════════════╡ │ [1, 2] ┆ [1.0, 2.0] │ │ [3, 4] ┆ [3.0, 4.0] │ └───────────────┴─────────────┘ """ inner: PolarsDataType def __init__(self, inner: PolarsDataType | PythonDataType) -> None: self.inner = polars.datatypes.parse_into_dtype(inner) def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # This equality check allows comparison of type classes and type instances. # If a parent type is not specific about its inner type, we infer it as equal: # > list[i64] == list[i64] -> True # > list[i64] == list[f32] -> False # > list[i64] == list -> True # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, List): return True elif isinstance(other, List): return self.inner == other.inner else: return False def __hash__(self) -> int: return hash((self.__class__, self.inner)) def __repr__(self) -> str: class_name = self.__class__.__name__ return f"{class_name}({self.inner!r})"
class Array(NestedType): """ Fixed length list type. Parameters ---------- inner The `DataType` of the values within each array. width The length of the arrays. Examples -------- >>> s = pl.Series("a", [[1, 2], [4, 3]], dtype=pl.Array(pl.Int64, 2)) >>> s shape: (2,) Series: 'a' [array[i64, 2]] [ [1, 2] [4, 3] ] """ inner: PolarsDataType size: int shape: tuple[int, ...] def __init__( self, inner: PolarsDataType | PythonDataType, shape: int | tuple[int, ...] | None = None, *, width: int | None = None, ) -> None: if width is not None: from polars._utils.deprecation import issue_deprecation_warning issue_deprecation_warning( "The `width` parameter for `Array` is deprecated. Use `shape` instead.", version="0.20.31", ) shape = width elif shape is None: msg = "Array constructor is missing the required argument `shape`" raise TypeError(msg) inner_parsed = polars.datatypes.parse_into_dtype(inner) inner_shape = inner_parsed.shape if isinstance(inner_parsed, Array) else () if isinstance(shape, int): self.inner = inner_parsed self.size = shape self.shape = (shape,) + inner_shape elif isinstance(shape, tuple): if len(shape) > 1: inner_parsed = Array(inner_parsed, shape[1:]) self.inner = inner_parsed self.size = shape[0] self.shape = shape + inner_shape else: msg = f"invalid input for shape: {shape!r}" raise TypeError(msg) def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # This equality check allows comparison of type classes and type instances. # If a parent type is not specific about its inner type, we infer it as equal: # > array[i64] == array[i64] -> True # > array[i64] == array[f32] -> False # > array[i64] == array -> True # allow comparing object instances to class if type(other) is DataTypeClass and issubclass(other, Array): return True elif isinstance(other, Array): if self.shape != other.shape: return False else: return self.inner == other.inner else: return False def __hash__(self) -> int: return hash((self.__class__, self.inner, self.size)) def __repr__(self) -> str: # Get leaf type dtype = self.inner while isinstance(dtype, Array): dtype = dtype.inner class_name = self.__class__.__name__ return f"{class_name}({dtype!r}, shape={self.shape})" @property def width(self) -> int: """The size of the Array.""" from polars._utils.deprecation import issue_deprecation_warning issue_deprecation_warning( "The `width` attribute for `Array` is deprecated. Use `size` instead.", version="0.20.31", ) return self.size class Field: """ Definition of a single field within a `Struct` DataType. Parameters ---------- name The name of the field within its parent `Struct`. dtype The `DataType` of the field's values. """ name: str dtype: PolarsDataType def __init__(self, name: str, dtype: PolarsDataType) -> None: self.name = name self.dtype = polars.datatypes.parse_into_dtype(dtype) def __eq__(self, other: Field) -> bool: # type: ignore[override] return (self.name == other.name) & (self.dtype == other.dtype) def __hash__(self) -> int: return hash((self.name, self.dtype)) def __repr__(self) -> str: class_name = self.__class__.__name__ return f"{class_name}({self.name!r}, {self.dtype})" class Struct(NestedType): """ Struct composite type. Parameters ---------- fields The fields that make up the struct. Can be either a sequence of Field objects or a mapping of column names to data types. Examples -------- Initialize using a dictionary: >>> dtype = pl.Struct({"a": pl.Int8, "b": pl.List(pl.String)}) >>> dtype Struct({'a': Int8, 'b': List(String)}) Initialize using a list of Field objects: >>> dtype = pl.Struct([pl.Field("a", pl.Int8), pl.Field("b", pl.List(pl.String))]) >>> dtype Struct({'a': Int8, 'b': List(String)}) When initializing a Series, Polars can infer a struct data type from the data. >>> s = pl.Series([{"a": 1, "b": ["x", "y"]}, {"a": 2, "b": ["z"]}]) >>> s shape: (2,) Series: '' [struct[2]] [ {1,["x", "y"]} {2,["z"]} ] >>> s.dtype Struct({'a': Int64, 'b': List(String)}) """ fields: list[Field] def __init__(self, fields: Sequence[Field] | SchemaDict) -> None: if isinstance(fields, Mapping): self.fields = [Field(name, dtype) for name, dtype in fields.items()] else: self.fields = list(fields) def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override] # The comparison allows comparing objects to classes, and specific # inner types to those without (eg: inner=None). if one of the # arguments is not specific about its inner type we infer it # as being equal. (See the List type for more info). if isclass(other) and issubclass(other, Struct): return True elif isinstance(other, Struct): return self.fields == other.fields else: return False def __hash__(self) -> int: return hash((self.__class__, tuple(self.fields))) def __iter__(self) -> Iterator[tuple[str, PolarsDataType]]: for fld in self.fields: yield fld.name, fld.dtype def __reversed__(self) -> Iterator[tuple[str, PolarsDataType]]: for fld in reversed(self.fields): yield fld.name, fld.dtype def __repr__(self) -> str: class_name = self.__class__.__name__ return f"{class_name}({dict(self)})" def to_schema(self) -> OrderedDict[str, PolarsDataType]: """Return Struct dtype as a schema dict.""" return OrderedDict(self)