Source code for tidypolars.funs

import polars as pl
from .tibble import from_polars, Tibble
from .utils import (
    _as_list,
    _col_expr,
    _col_exprs,
    _is_constant,
    _is_list,
    _is_iterable,
    _is_series
)

__all__ = [
    # General functions
    "abs",
    "across",
    "case_when",
    "coalesce",
    "floor",
    "if_else",
    "lag", "lead",
    "log", "log10",
    "read_csv", "read_parquet",
    "rep",
    "replace_null",
    "round",
    "row_number",
    "sqrt",

    # Agg stats
    "cor", "cov", "count", "first", "last", "length",
    "max", "mean", "median", "min", "n",
    "n_distinct", "quantile", "sd", "sum", "var",

    # Predicates
    "between", "is_finite", "is_in", "is_infinite",
    "is_nan", "is_not", "is_not_in", "is_not_null", "is_null",

    # Type conversion
    "as_boolean", "as_float", "as_integer", "as_string",
    "cast"
]

[docs]def across(cols, fn = lambda x: x, names_prefix = None): """ Apply a function across a selection of columns Parameters ---------- cols : list Columns to operate on fn : lambda A function or lambda to apply to each column names_prefix : Optional - str Prefix to append to changed columns Examples -------- >>> df = tp.Tibble(x = ['a', 'a', 'b'], y = range(3), z = range(3)) >>> df.mutate(across(['y', 'z'], lambda x: x * 2)) >>> df.mutate(across(tp.Int64, lambda x: x * 2, names_prefix = "double_")) >>> df.summarize(across(['y', 'z'], tp.mean), by = 'x') """ _cols = _col_exprs(_as_list(cols)) exprs = [fn(_col) for _col in _cols] if names_prefix != None: exprs = [expr.prefix(names_prefix) for expr in exprs] return exprs
[docs]def as_boolean(x): """ Convert to a boolean Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(bool_x = tp.as_boolean(col('x'))) """ x = _col_expr(x) return x.cast(pl.Boolean)
[docs]def as_float(x): """ Convert to float. Defaults to Float64. Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(float_x = tp.as_float(col('x'))) """ x = _col_expr(x) return x.cast(pl.Float64)
[docs]def as_integer(x): """ Convert to integer. Defaults to Int64. Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(int_x = tp.as_integer(col('x'))) """ x = _col_expr(x) return x.cast(pl.Int64)
[docs]def as_string(x): """ Convert to string. Defaults to Utf8. Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(string_x = tp.as_string(col('x'))) """ x = _col_expr(x) return x.cast(pl.Utf8)
[docs]def abs(x): """ Absolute value Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(abs_x = tp.abs('x')) >>> df.mutate(abs_x = tp.abs(col('x'))) """ x = _col_expr(x) return x.abs()
[docs]def between(x, left, right): """ Test if values of a column are between two values Parameters ---------- x : Expr, Series Column to operate on left : int Value to test if column is greater than or equal to right : int Value to test if column is less than or equal to Examples -------- >>> df = tp.Tibble(x = range(4)) >>> df.filter(tp.between(col('x'), 1, 3)) """ x = _col_expr(x) return x.is_between(left, right, include_bounds = True)
[docs]def case_when(expr): """ Case when Parameters ---------- expr : Expr A logical expression Examples -------- >>> df = tp.Tibble(x = range(1, 4)) >>> df.mutate( >>> case_x = tp.case_when(col('x') < 2).then(1) >>> .when(col('x') < 3).then(2) >>> .otherwise(0) >>> ) """ return pl.when(expr)
[docs]def cast(x, dtype): """ General type conversion. Parameters ---------- x : Expr, Series Column to operate on dtype : DataType Type to convert to Examples -------- >>> df.mutate(abs_x = tp.cast(col('x'), tp.Float64)) """ x = _col_expr(x) return x.cast(dtype)
[docs]def coalesce(*args): """ Coalesce missing values Parameters ---------- args : Expr Columns to coalesce Examples -------- >>> df.mutate(abs_x = tp.cast(col('x'), tp.Float64)) """ args = _as_list(args) expr = if_else(args[0].is_null(), args[1], args[0]) if len(args) > 2: locs = range(2, len(args)) for i in locs: expr = if_else(expr.is_null(), args[i], expr) return expr
[docs]def cor(x, y, method = 'pearson'): """ Find the correlation of two columns Parameters ---------- x : Expr A column y : Expr A column method : str Type of correlation to find. Either 'pearson' or 'spearman'. Examples -------- >>> df.summarize(cor = tp.cor(col('x'), col('y'))) """ if method == 'pearson': out = pl.pearson_corr(x, y) elif method == 'spearman': out = pl.spearman_rank_corr(x, y) else: ValueError("`method` must be either 'pearson' or 'spearman'") return out
[docs]def cov(x, y): """ Find the covariance of two columns Parameters ---------- x : Expr A column y : Expr A column Examples -------- >>> df.summarize(cor = tp.cov(col('x'), col('y'))) """ return pl.cov(x, y)
[docs]def count(x): """ Number of observations in each group Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(count = tp.count(col('x'))) """ x = _col_expr(x) return x.count()
[docs]def first(x): """ Get first value Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(first_x = tp.first('x')) >>> df.summarize(first_x = tp.first(col('x'))) """ x = _col_expr(x) return x.first()
[docs]def floor(x): """ Round numbers down to the lower integer Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(floor_x = tp.floor(col('x'))) """ x = _col_expr(x) return x.floor()
[docs]def if_else(condition, true, false): """ If Else Parameters ---------- condition : Expr A logical expression true : Value if the condition is true false : Value if the condition is false Examples -------- >>> df = tp.Tibble(x = range(1, 4)) >>> df.mutate(if_x = tp.if_else(col('x') < 2, 1, 2)) """ return pl.when(condition).then(true).otherwise(false)
[docs]def is_finite(x): """ Test if values of a column are finite Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = [1.0, float('inf')]) >>> df.filter(tp.is_finite(col('x'))) """ x = _col_expr(x) return x.is_finite()
[docs]def is_in(x, y): """ Test if values of a column are in a list of values Parameters ---------- x : Expr, Series Column to operate on y : list List to test against Examples -------- >>> df = tp.Tibble(x = range(3)) >>> df.filter(tp.is_in(col('x'), [1, 2])) """ x = _col_expr(x) return x.is_in(y)
[docs]def is_infinite(x): """ Test if values of a column are infinite Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = [1.0, float('inf')]) >>> df.filter(tp.is_infinite(col('x'))) """ x = _col_expr(x) return x.is_infinite()
[docs]def is_not(x): """ Flip values of a boolean series Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = range(3)) >>> df.filter(tp.is_not(col('x') < 2)) """ x = _col_expr(x) return x.is_not()
[docs]def is_nan(x): """ Test if values of a column are nan Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = range(3)) >>> df.filter(tp.is_nan(col('x'))) """ x = _col_expr(x) return x.is_nan()
[docs]def is_not_in(x, y): """ Test if values of a column are not in a list of values Parameters ---------- x : Expr, Series Column to operate on y : list List to test against Examples -------- >>> df = tp.Tibble(x = range(3)) >>> df.filter(tp.is_not_in(col('x'), [1, 2])) """ x = _col_expr(x) return x.is_in(y).is_not()
[docs]def is_not_null(x): """ Test if values of a column are not null Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = range(3)) >>> df.filter(tp.is_not_in(col('x'), [1, 2])) """ x = _col_expr(x) return x.is_null().is_not()
[docs]def is_null(x): """ Test if values of a column are null Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = range(3)) >>> df.filter(tp.is_not_in(col('x'), [1, 2])) """ x = _col_expr(x) return x.is_null()
def _shift(x, n, default): if default == None: return x.shift(n) else: return x.shift_and_fill(n, default)
[docs]def lag(x, n: int = 1, default = None): """ Get lagging values Parameters ---------- x : Expr, Series Column to operate on n : int Number of positions to lag by default : optional Value to fill in missing values Examples -------- >>> df.mutate(lag_x = tp.lag(col('x'))) >>> df.mutate(lag_x = tp.lag('x')) """ x = _col_expr(x) return _shift(x, n, default)
[docs]def last(x): """ Get last value Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(last_x = tp.last('x')) >>> df.summarize(last_x = tp.last(col('x'))) """ x = _col_expr(x) return x.last()
[docs]def lead(x, n: int = 1, default = None): """ Get leading values Parameters ---------- x : Expr, Series Column to operate on n : int Number of positions to lead by default : optional Value to fill in missing values Examples -------- >>> df.mutate(lead_x = tp.lead(col('x'))) >>> df.mutate(lead_x = col('x').lead()) """ x = _col_expr(x) return _shift(x, -n, default)
[docs]def length(x): """ Number of observations in each group Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(length = tp.length(col('x'))) """ x = _col_expr(x) return x.count()
[docs]def log(x): """ Compute the natural logarithm of a column Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(log = tp.log('x')) """ x = _col_expr(x) return x.log()
[docs]def log10(x): """ Compute the base 10 logarithm of a column Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.mutate(log = tp.log10('x')) """ x = _col_expr(x) return x.log10()
[docs]def max(x): """ Get column max Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(max_x = tp.max('x')) >>> df.summarize(max_x = tp.max(col('x'))) """ x = _col_expr(x) return x.max()
[docs]def mean(x): """ Get column mean Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(mean_x = tp.mean('x')) >>> df.summarize(mean_x = tp.mean(col('x'))) """ x = _col_expr(x) return x.mean()
[docs]def median(x): """ Get column median Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(median_x = tp.median('x')) >>> df.summarize(median_x = tp.median(col('x'))) """ x = _col_expr(x) return x.median()
[docs]def min(x): """ Get column minimum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(min_x = tp.min('x')) >>> df.summarize(min_x = tp.min(col('x'))) """ x = _col_expr(x) return x.min()
[docs]def n(): """ Number of observations in each group Examples -------- >>> df.summarize(count = tp.n()) """ return pl.count()
[docs]def n_distinct(x): """ Get number of distinct values in a column Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(min_x = tp.n_distinct('x')) >>> df.summarize(min_x = tp.n_distinct(col('x'))) """ x = _col_expr(x) return x.n_unique()
[docs]def quantile(x, quantile = .5): """ Get number of distinct values in a column Parameters ---------- x : Expr, Series Column to operate on quantile : float Quantile to return Examples -------- >>> df.summarize(quantile_x = tp.quantile('x', .25)) """ x = _col_expr(x) return x.quantile(quantile)
[docs]def read_csv(file: str, *args, **kwargs): """Simple wrapper around polars.read_csv""" return pl.read_csv(file, *args, **kwargs).pipe(from_polars)
[docs]def read_parquet(source: str, *args, **kwargs): """Simple wrapper around polars.read_parquet""" return pl.read_parquet(source, *args, **kwargs).pipe(from_polars)
[docs]def rep(x, times = 1): """ Replicate the values in x Parameters ---------- x : const, Series Value or Series to repeat times : int Number of times to repeat Examples -------- >>> tp.rep(1, 3) >>> tp.rep(pl.Series(range(3)), 3) """ if _is_constant(x): out = [x] elif _is_series(x): out = x.to_list() elif _is_list(x): out = x elif isinstance(x, Tibble): out = pl.concat([x for i in range(times)]).pipe(from_polars) elif _is_iterable(x): out = list(x) else: ValueError("Incompatible type") if _is_list(out): out = pl.Series(out * times) return out
[docs]def replace_null(x, replace = None): """ Replace null values Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df = tp.Tibble(x = [0, None], y = [None, None]) >>> df.mutate(x = tp.replace_null(col('x'), 1)) """ if replace == None: return x return x.fill_null(replace)
[docs]def round(x, decimals = 0): """ Get column standard deviation Parameters ---------- x : Expr, Series Column to operate on decimals : int Decimals to round to Examples -------- >>> df.mutate(x = tp.round(col('x'))) """ x = _col_expr(x) return x.round(decimals)
[docs]def row_number(): """ Return row number Examples -------- >>> df.mutate(row_num = tp.row_number()) """ return pl.first().cumcount() + 1
[docs]def sd(x): """ Get column standard deviation Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(sd_x = tp.sd('x')) >>> df.summarize(sd_x = tp.sd(col('x'))) """ x = _col_expr(x) return x.std()
[docs]def sqrt(x): """ Get column square root Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.mutate(sqrt_x = tp.sqrt('x')) """ x = _col_expr(x) return x.sqrt()
[docs]def sum(x): """ Get column sum Parameters ---------- x : Expr, Series Column to operate on Examples -------- >>> df.summarize(sum_x = tp.sum('x')) >>> df.summarize(sum_x = tp.sum(col('x'))) """ x = _col_expr(x) return x.sum()
[docs]def var(x): """ Get column variance Parameters ---------- x : Expr Column to operate on Examples -------- >>> df.summarize(sum_x = tp.var('x')) >>> df.summarize(sum_x = tp.var(col('x'))) """ x = _col_expr(x) return x.var()