Source code for tidypolars.funs

import polars as pl
from .tibble_df import from_polars, tibble
from .utils import (
    _as_list,
    _col_expr,
    _col_exprs,
    _is_constant,
    _is_list,
    _is_iterable,
    _is_series,
    _is_string,
    _str_to_lit
)

# Names exported by a star-import of this module, grouped by category.
__all__ = [
    # General functions
    "abs",
    "across",
    "case_when",
    "coalesce",
    "floor",
    "if_else",
    "lag", "lead",
    "log", "log10",
    "read_csv", "read_parquet",
    "rep",
    "replace_null",
    "round",
    "row_number",
    "sqrt",

    # Agg stats
    "cor", "cov", "count", "first", "last", "length",
    "max", "mean", "median", "min", "n",
    "n_distinct", "quantile", "sd", "sum", "var",

    # Predicates
    "between", "is_finite", "is_in", "is_infinite",
    "is_nan", "is_not", "is_not_in", "is_not_null", "is_null",

    # Type conversion
    "as_boolean", "as_float", "as_integer", "as_string",
    "cast"
]


[docs]
def across(cols, fn = lambda x: x, names_prefix = None):
    """
    Apply a function across a selection of columns

    Parameters
    ----------
    cols : list
        Columns to operate on
    fn : lambda
        A function or lambda to apply to each column
    names_prefix : Optional - str
        Prefix to prepend to the names of the changed columns

    Returns
    -------
    list
        One expression per column expression produced from `cols`

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'a', 'b'], y = range(3), z = range(3))
    >>> df.mutate(across(['y', 'z'], lambda x: x * 2))
    >>> df.mutate(across(tp.Int64, lambda x: x * 2, names_prefix = "double_"))
    >>> df.summarize(across(['y', 'z'], tp.mean), by = 'x')
    """
    exprs = [fn(_col) for _col in _col_exprs(_as_list(cols))]
    # Identity test: `!= None` would dispatch to the argument's own `__ne__`.
    if names_prefix is not None:
        exprs = [expr.name.prefix(names_prefix) for expr in exprs]
    return exprs



[docs]
def as_boolean(x):
    """
    Cast a column to the polars Boolean dtype

    Parameters
    ----------
    x : Expr
        Column to convert

    Returns
    -------
    The column cast to ``pl.Boolean``

    Examples
    --------
    >>> df.mutate(bool_x = tp.as_boolean(col('x')))
    """
    return _col_expr(x).cast(pl.Boolean)



[docs]
def as_float(x):
    """
    Cast a column to float (always ``pl.Float64``)

    Parameters
    ----------
    x : Expr, Series
        Column to convert

    Returns
    -------
    The column cast to ``pl.Float64``

    Examples
    --------
    >>> df.mutate(float_x = tp.as_float(col('x')))
    """
    column = _col_expr(x)
    as_f64 = column.cast(pl.Float64)
    return as_f64



[docs]
def as_integer(x):
    """
    Cast a column to integer (always ``pl.Int64``)

    Parameters
    ----------
    x : Expr
        Column to convert

    Returns
    -------
    The column cast to ``pl.Int64``

    Examples
    --------
    >>> df.mutate(int_x = tp.as_integer(col('x')))
    """
    return _col_expr(x).cast(pl.Int64)



[docs]
def as_string(x):
    """
    Cast a column to string (always ``pl.Utf8``)

    Parameters
    ----------
    x : Expr
        Column to convert

    Returns
    -------
    The column cast to ``pl.Utf8``

    Examples
    --------
    >>> df.mutate(string_x = tp.as_string(col('x')))
    """
    column = _col_expr(x)
    as_utf8 = column.cast(pl.Utf8)
    return as_utf8



[docs]
def abs(x):
    """
    Compute the absolute value of a column

    Parameters
    ----------
    x : Expr, Series
        Column to take the absolute value of

    Returns
    -------
    The result of calling ``.abs()`` on the column

    Examples
    --------
    >>> df.mutate(abs_x = tp.abs('x'))
    >>> df.mutate(abs_x = tp.abs(col('x')))
    """
    return _col_expr(x).abs()



[docs]
def between(x, left, right):
    """
    Check whether the values of a column fall between two bounds

    Parameters
    ----------
    x : Expr, Series
        Column to test
    left : int
        Lower bound, forwarded to ``is_between``
    right : int
        Upper bound, forwarded to ``is_between``

    Returns
    -------
    The result of ``x.is_between(left, right)``

    Examples
    --------
    >>> df = tp.tibble(x = range(4))
    >>> df.filter(tp.between(col('x'), 1, 3))
    """
    column = _col_expr(x)
    in_range = column.is_between(left, right)
    return in_range



[docs]
def case_when(*args, _default = pl.Null):
    """
    Case when

    Builds a chained ``when/then/.../otherwise`` expression from alternating
    condition/value arguments.

    Parameters
    ----------
    *args : Expr, value
        Alternating pairs: a logical expression followed by the value to use
        when that expression is true. Values are passed through
        `_str_to_lit` before use.
    _default : optional
        Value used when no condition matches. Defaults to ``pl.Null``.

    Returns
    -------
    Expr
        The chained conditional expression

    Raises
    ------
    ValueError
        If no arguments are given, or if a condition has no matching value
        (odd number of positional arguments).

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> df.mutate(
    >>>    case_x = tp.case_when(col('x') < 2, 1,
    >>>                          col('x') < 3, 2,
    >>>                          _default = 0)
    >>> )
    """
    # Fail early with a clear message instead of an UnboundLocalError
    # (no arguments) or IndexError (dangling condition) further down.
    if len(args) == 0:
        raise ValueError("`case_when` requires at least one condition/value pair")
    if len(args) % 2 != 0:
        raise ValueError("`case_when` requires a value for every condition")

    conditions = args[0::2]
    values = [_str_to_lit(value) for value in args[1::2]]

    # The first pair opens the chain with `pl.when`; the rest extend it.
    expr = pl.when(conditions[0]).then(values[0])
    for condition, value in zip(conditions[1:], values[1:]):
        expr = expr.when(condition).then(value)
    return expr.otherwise(_str_to_lit(_default))



[docs]
def cast(x, dtype):
    """
    Convert a column to an arbitrary data type

    Parameters
    ----------
    x : Expr, Series
        Column to convert
    dtype : DataType
        Target type, forwarded to ``.cast()``

    Returns
    -------
    The column cast to `dtype`

    Examples
    --------
    >>> df.mutate(float_x = tp.cast(col('x'), tp.Float64))
    """
    return _col_expr(x).cast(dtype)



[docs]
def coalesce(*args):
    """
    Coalesce missing values

    Walks the arguments left to right, replacing nulls in the running result
    with the next argument via `if_else`.

    Parameters
    ----------
    args : Expr
        Columns to coalesce. A single column is returned unchanged.

    Returns
    -------
    Expr
        The first argument with its nulls filled from the following arguments

    Raises
    ------
    ValueError
        If no columns are given.

    Examples
    --------
    >>> df.mutate(coalesce_xy = tp.coalesce(col('x'), col('y')))
    """
    args = _as_list(args)
    if len(args) == 0:
        raise ValueError("`coalesce` requires at least one column")
    # Start from the first column; with a single column there is nothing to
    # fill from, so it is returned as-is rather than raising IndexError.
    expr = args[0]
    for i in range(1, len(args)):
        expr = if_else(expr.is_null(), args[i], expr)
    return expr



[docs]
def cor(x, y, method = 'pearson'):
    """
    Find the correlation of two columns

    Parameters
    ----------
    x : Expr
        A column
    y : Expr
        A column
    method : str
        Type of correlation to find. Either 'pearson' or 'spearman'.

    Returns
    -------
    The result of ``pl.corr(x, y, method = method)``

    Raises
    ------
    ValueError
        If `method` is neither 'pearson' nor 'spearman'.

    Examples
    --------
    >>> df.summarize(cor = tp.cor(col('x'), col('y')))
    """
    # Validate up front; the exception must actually be raised to have effect.
    if method not in ('pearson', 'spearman'):
        raise ValueError("`method` must be either 'pearson' or 'spearman'")
    return pl.corr(x, y, method = method)



[docs]
def cov(x, y):
    """
    Compute the covariance between two columns

    Parameters
    ----------
    x : Expr
        First column
    y : Expr
        Second column

    Returns
    -------
    The result of ``pl.cov(x, y)``

    Examples
    --------
    >>> df.summarize(cov = tp.cov(col('x'), col('y')))
    """
    covariance = pl.cov(x, y)
    return covariance



[docs]
def count(x):
    """
    Count the observations of a column in each group

    Parameters
    ----------
    x : Expr, Series
        Column to count

    Returns
    -------
    The result of calling ``.count()`` on the column

    Examples
    --------
    >>> df.summarize(count = tp.count(col('x')))
    """
    return _col_expr(x).count()



[docs]
def first(x):
    """
    Return the first value of a column

    Parameters
    ----------
    x : Expr, Series
        Column to take the first value from

    Returns
    -------
    The result of calling ``.first()`` on the column

    Examples
    --------
    >>> df.summarize(first_x = tp.first('x'))
    >>> df.summarize(first_x = tp.first(col('x')))
    """
    column = _col_expr(x)
    head_value = column.first()
    return head_value



[docs]
def floor(x):
    """
    Round the values of a column down to the lower integer

    Parameters
    ----------
    x : Expr, Series
        Column to round down

    Returns
    -------
    The result of calling ``.floor()`` on the column

    Examples
    --------
    >>> df.mutate(floor_x = tp.floor(col('x')))
    """
    return _col_expr(x).floor()



[docs]
def if_else(condition, true, false):
    """
    Choose between two values based on a condition

    Implemented as a single-branch `case_when` whose default is `false`.

    Parameters
    ----------
    condition : Expr
        A logical expression
    true :
        Value used where the condition is true
    false :
        Value used where the condition is false

    Returns
    -------
    The expression built by `case_when`

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> df.mutate(if_x = tp.if_else(col('x') < 2, 1, 2))
    """
    branch = (condition, true)
    return case_when(*branch, _default = false)



[docs]
def is_finite(x):
    """
    Check whether the values of a column are finite

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Returns
    -------
    The result of calling ``.is_finite()`` on the column

    Examples
    --------
    >>> df = tp.tibble(x = [1.0, float('inf')])
    >>> df.filter(tp.is_finite(col('x')))
    """
    return _col_expr(x).is_finite()



[docs]
def is_in(x, y):
    """
    Check whether the values of a column appear in a list of values

    Parameters
    ----------
    x : Expr, Series
        Column to test
    y : list
        Values to test membership against

    Returns
    -------
    The result of ``x.is_in(y)``

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_in(col('x'), [1, 2]))
    """
    column = _col_expr(x)
    membership = column.is_in(y)
    return membership



[docs]
def is_infinite(x):
    """
    Check whether the values of a column are infinite

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Returns
    -------
    The result of calling ``.is_infinite()`` on the column

    Examples
    --------
    >>> df = tp.tibble(x = [1.0, float('inf')])
    >>> df.filter(tp.is_infinite(col('x')))
    """
    return _col_expr(x).is_infinite()



[docs]
def is_not(x):
    """
    Negate the values of a boolean column

    Parameters
    ----------
    x : Expr, Series
        Boolean column to negate

    Returns
    -------
    The result of calling ``.not_()`` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_not(col('x') < 2))
    """
    column = _col_expr(x)
    negated = column.not_()
    return negated



[docs]
def is_nan(x):
    """
    Check whether the values of a column are NaN

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Returns
    -------
    The result of calling ``.is_nan()`` on the column

    Examples
    --------
    >>> df = tp.tibble(x = [1.0, float('nan')])
    >>> df.filter(tp.is_nan(col('x')))
    """
    return _col_expr(x).is_nan()



[docs]
def is_not_in(x, y):
    """
    Check whether the values of a column are absent from a list of values

    Parameters
    ----------
    x : Expr, Series
        Column to test
    y : list
        Values to test membership against

    Returns
    -------
    The negation (``.not_()``) of ``x.is_in(y)``

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_not_in(col('x'), [1, 2]))
    """
    column = _col_expr(x)
    membership = column.is_in(y)
    return membership.not_()



[docs]
def is_not_null(x):
    """
    Check whether the values of a column are non-null

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Returns
    -------
    The negation (``.not_()``) of ``x.is_null()``

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_not_null(col('x')))
    """
    column = _col_expr(x)
    null_mask = column.is_null()
    return null_mask.not_()



[docs]
def is_null(x):
    """
    Check whether the values of a column are null

    Parameters
    ----------
    x : Expr, Series
        Column to test

    Returns
    -------
    The result of calling ``.is_null()`` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_null(col('x')))
    """
    return _col_expr(x).is_null()



[docs]
def lag(x, n: int = 1, default = None):
    """
    Shift a column by `n` positions to get lagging values

    Parameters
    ----------
    x : Expr, Series
        Column to shift

    n : int
        Number of positions to lag by

    default : optional
        Fill value for the positions vacated by the shift

    Returns
    -------
    The result of ``x.shift(n, fill_value = default)``

    Examples
    --------
    >>> df.mutate(lag_x = tp.lag(col('x')))
    >>> df.mutate(lag_x = tp.lag('x'))
    """
    column = _col_expr(x)
    shifted = column.shift(n, fill_value = default)
    return shifted



[docs]
def last(x):
    """
    Return the last value of a column

    Parameters
    ----------
    x : Expr, Series
        Column to take the last value from

    Returns
    -------
    The result of calling ``.last()`` on the column

    Examples
    --------
    >>> df.summarize(last_x = tp.last('x'))
    >>> df.summarize(last_x = tp.last(col('x')))
    """
    return _col_expr(x).last()



[docs]
def lead(x, n: int = 1, default = None):
    """
    Shift a column by `-n` positions to get leading values

    Parameters
    ----------
    x : Expr, Series
        Column to shift

    n : int
        Number of positions to lead by

    default : optional
        Fill value for the positions vacated by the shift

    Returns
    -------
    The result of ``x.shift(-n, fill_value = default)``

    Examples
    --------
    >>> df.mutate(lead_x = tp.lead(col('x')))
    >>> df.mutate(lead_x = tp.lead('x'))
    """
    column = _col_expr(x)
    offset = -n  # a negative shift pulls later values forward
    return column.shift(offset, fill_value = default)



[docs]
def length(x):
    """
    Count the observations of a column in each group

    Parameters
    ----------
    x : Expr, Series
        Column to count

    Returns
    -------
    The result of calling ``.count()`` on the column

    Examples
    --------
    >>> df.summarize(length = tp.length(col('x')))
    """
    # NOTE(review): this is identical to `count`; confirm whether `.count()`
    # or `.len()` is the intended polars call for a "length".
    column = _col_expr(x)
    return column.count()



[docs]
def log(x):
    """
    Take the natural logarithm of a column

    Parameters
    ----------
    x : Expr
        Column to take the logarithm of

    Returns
    -------
    The result of calling ``.log()`` on the column

    Examples
    --------
    >>> df.mutate(log = tp.log('x'))
    """
    return _col_expr(x).log()



[docs]
def log10(x):
    """
    Take the base 10 logarithm of a column

    Parameters
    ----------
    x : Expr
        Column to take the logarithm of

    Returns
    -------
    The result of calling ``.log10()`` on the column

    Examples
    --------
    >>> df.mutate(log = tp.log10('x'))
    """
    column = _col_expr(x)
    log_base_10 = column.log10()
    return log_base_10



[docs]
def max(x):
    """
    Return the maximum of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.max()`` on the column

    Examples
    --------
    >>> df.summarize(max_x = tp.max('x'))
    >>> df.summarize(max_x = tp.max(col('x')))
    """
    return _col_expr(x).max()



[docs]
def mean(x):
    """
    Return the mean of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.mean()`` on the column

    Examples
    --------
    >>> df.summarize(mean_x = tp.mean('x'))
    >>> df.summarize(mean_x = tp.mean(col('x')))
    """
    column = _col_expr(x)
    average = column.mean()
    return average



[docs]
def median(x):
    """
    Return the median of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.median()`` on the column

    Examples
    --------
    >>> df.summarize(median_x = tp.median('x'))
    >>> df.summarize(median_x = tp.median(col('x')))
    """
    return _col_expr(x).median()



[docs]
def min(x):
    """
    Return the minimum of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.min()`` on the column

    Examples
    --------
    >>> df.summarize(min_x = tp.min('x'))
    >>> df.summarize(min_x = tp.min(col('x')))
    """
    column = _col_expr(x)
    smallest = column.min()
    return smallest



[docs]
def n():
    """
    Count the observations in each group

    Returns
    -------
    The expression produced by ``pl.len()``

    Examples
    --------
    >>> df.summarize(count = tp.n())
    """
    row_count = pl.len()
    return row_count



[docs]
def n_distinct(x):
    """
    Count the distinct values in a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.n_unique()`` on the column

    Examples
    --------
    >>> df.summarize(n_distinct_x = tp.n_distinct('x'))
    >>> df.summarize(n_distinct_x = tp.n_distinct(col('x')))
    """
    return _col_expr(x).n_unique()



[docs]
def quantile(x, quantile = .5):
    """
    Return a quantile of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    quantile : float
        Quantile to return, forwarded to ``.quantile()``

    Returns
    -------
    The result of ``x.quantile(quantile)``

    Examples
    --------
    >>> df.summarize(quantile_x = tp.quantile('x', .25))
    """
    return _col_expr(x).quantile(quantile)



[docs]
def read_csv(file: str,
             *args,
             **kwargs):
    """Read a CSV with polars.read_csv and pipe the result through from_polars"""
    frame = pl.read_csv(file, *args, **kwargs)
    return frame.pipe(from_polars)



[docs]
def read_parquet(source: str,
                 *args,
                 **kwargs):
    """Read a parquet file with polars.read_parquet and pipe the result through from_polars"""
    frame = pl.read_parquet(source, *args, **kwargs)
    return frame.pipe(from_polars)



[docs]
def rep(x, times = 1):
    """
    Replicate the values in x

    Parameters
    ----------
    x : const, Series, list, tibble, iterable
        Value or collection of values to repeat
    times : int
        Number of times to repeat

    Returns
    -------
    Series or tibble
        A tibble when `x` is a tibble (the frame concatenated `times` times),
        otherwise a ``pl.Series`` of the repeated values

    Raises
    ------
    ValueError
        If `x` is none of the supported types.

    Examples
    --------
    >>> tp.rep(1, 3)
    >>> tp.rep(pl.Series(range(3)), 3)
    """
    if _is_constant(x):
        out = [x]
    elif _is_series(x):
        out = x.to_list()
    elif _is_list(x):
        out = x
    elif isinstance(x, tibble):
        out = pl.concat([x for _ in range(times)]).pipe(from_polars)
    elif _is_iterable(x):
        out = list(x)
    else:
        # Must be raised: otherwise `out` is unbound below and the caller
        # sees an unrelated UnboundLocalError.
        raise ValueError("Incompatible type")
    # Only the list-valued branches still need repeating; the tibble branch
    # has already produced its final result.
    if _is_list(out):
        out = pl.Series(out * times)
    return out



[docs]
def replace_null(x, replace = None):
    """
    Replace null values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    replace : optional
        Value used to fill nulls, forwarded to ``.fill_null()``. When it is
        ``None`` the column is returned unchanged.

    Returns
    -------
    `x` itself when `replace` is ``None``, otherwise ``x.fill_null(replace)``

    Examples
    --------
    >>> df = tp.tibble(x = [0, None], y = [None, None])
    >>> df.mutate(x = tp.replace_null(col('x'), 1))
    """
    # Identity test on purpose: `replace == None` would invoke the `__eq__` of
    # whatever object was passed, which need not return a plain bool.
    if replace is None:
        return x
    return x.fill_null(replace)



[docs]
def round(x, decimals = 0):
    """
    Round the values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to round
    decimals : int
        Number of decimals to round to, forwarded to ``.round()``

    Returns
    -------
    The result of ``x.round(decimals)``

    Examples
    --------
    >>> df.mutate(x = tp.round(col('x')))
    """
    return _col_expr(x).round(decimals)



[docs]
def row_number():
    """
    Number the rows starting from 1

    Returns
    -------
    The expression ``pl.int_range(0, pl.len()) + 1``

    Examples
    --------
    >>> df.mutate(row_num = tp.row_number())
    """
    zero_based = pl.int_range(0, pl.len())
    return zero_based + 1



[docs]
def sd(x):
    """
    Return the standard deviation of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.std()`` on the column

    Examples
    --------
    >>> df.summarize(sd_x = tp.sd('x'))
    >>> df.summarize(sd_x = tp.sd(col('x')))
    """
    return _col_expr(x).std()



[docs]
def sqrt(x):
    """
    Take the square root of a column

    Parameters
    ----------
    x : Expr, Series
        Column to take the square root of

    Returns
    -------
    The result of calling ``.sqrt()`` on the column

    Examples
    --------
    >>> df.mutate(sqrt_x = tp.sqrt('x'))
    """
    column = _col_expr(x)
    root = column.sqrt()
    return root



[docs]
def sum(x):
    """
    Return the sum of a column

    Parameters
    ----------
    x : Expr, Series
        Column to aggregate

    Returns
    -------
    The result of calling ``.sum()`` on the column

    Examples
    --------
    >>> df.summarize(sum_x = tp.sum('x'))
    >>> df.summarize(sum_x = tp.sum(col('x')))
    """
    return _col_expr(x).sum()



[docs]
def var(x):
    """
    Return the variance of a column

    Parameters
    ----------
    x : Expr
        Column to aggregate

    Returns
    -------
    The result of calling ``.var()`` on the column

    Examples
    --------
    >>> df.summarize(var_x = tp.var('x'))
    >>> df.summarize(var_x = tp.var(col('x')))
    """
    column = _col_expr(x)
    variance = column.var()
    return variance