import polars as pl
from .tibble_df import from_polars, tibble
from .utils import (
_as_list,
_col_expr,
_col_exprs,
_is_constant,
_is_list,
_is_iterable,
_is_series,
_is_string,
_str_to_lit
)
# Names exported by a star import of this module, grouped by purpose.
# NOTE(review): "median" is listed below, but no `median` definition is
# visible in this view of the file -- confirm it is defined somewhere,
# otherwise a star import will fail on the missing attribute.
__all__ = [
# General functions
"abs",
"across",
"case_when",
"coalesce",
"floor",
"if_else",
"lag", "lead",
"log", "log10",
"read_csv", "read_parquet",
"rep",
"replace_null",
"round",
"row_number",
"sqrt",
# Agg stats
"cor", "cov", "count", "first", "last", "length",
"max", "mean", "median", "min", "n",
"n_distinct", "quantile", "sd", "sum", "var",
# Predicates
"between", "is_finite", "is_in", "is_infinite",
"is_nan", "is_not", "is_not_in", "is_not_null", "is_null",
# Type conversion
"as_boolean", "as_float", "as_integer", "as_string",
"cast"
]
[docs]
def across(cols, fn = lambda x: x, names_prefix = None):
    """
    Apply a function across a selection of columns

    Parameters
    ----------
    cols : list
        Columns to operate on
    fn : lambda
        A function or lambda to apply to each column
    names_prefix : Optional - str
        Prefix to prepend to the names of the changed columns

    Returns
    -------
    list
        One expression per selected column, in selection order

    Examples
    --------
    >>> df = tp.tibble(x = ['a', 'a', 'b'], y = range(3), z = range(3))
    >>> df.mutate(across(['y', 'z'], lambda x: x * 2))
    >>> df.mutate(across(tp.Int64, lambda x: x * 2, names_prefix = "double_"))
    >>> df.summarize(across(['y', 'z'], tp.mean), by = 'x')
    """
    _cols = _col_exprs(_as_list(cols))
    exprs = [fn(_col) for _col in _cols]
    # Identity check: `!= None` would dispatch to an overloaded `__ne__`.
    if names_prefix is not None:
        exprs = [expr.name.prefix(names_prefix) for expr in exprs]
    return exprs
[docs]
def as_boolean(x):
    """
    Cast a column to boolean

    Parameters
    ----------
    x : Expr
        Column to operate on

    Returns
    -------
    The column cast to `pl.Boolean`

    Examples
    --------
    >>> df.mutate(bool_x = tp.as_boolean(col('x')))
    """
    return _col_expr(x).cast(pl.Boolean)
[docs]
def as_float(x):
    """
    Cast a column to float (always Float64)

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The column cast to `pl.Float64`

    Examples
    --------
    >>> df.mutate(float_x = tp.as_float(col('x')))
    """
    return _col_expr(x).cast(pl.Float64)
[docs]
def as_integer(x):
    """
    Cast a column to integer (always Int64)

    Parameters
    ----------
    x : Expr
        Column to operate on

    Returns
    -------
    The column cast to `pl.Int64`

    Examples
    --------
    >>> df.mutate(int_x = tp.as_integer(col('x')))
    """
    return _col_expr(x).cast(pl.Int64)
[docs]
def as_string(x):
    """
    Cast a column to string (always Utf8)

    Parameters
    ----------
    x : Expr
        Column to operate on

    Returns
    -------
    The column cast to `pl.Utf8`

    Examples
    --------
    >>> df.mutate(string_x = tp.as_string(col('x')))
    """
    return _col_expr(x).cast(pl.Utf8)
[docs]
def abs(x):
    """
    Compute the absolute value of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.abs()` on the column

    Examples
    --------
    >>> df.mutate(abs_x = tp.abs('x'))
    >>> df.mutate(abs_x = tp.abs(col('x')))
    """
    return _col_expr(x).abs()
[docs]
def between(x, left, right):
    """
    Check whether the values of a column fall between two bounds

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    left : int
        Lower bound passed to `is_between`
    right : int
        Upper bound passed to `is_between`

    Returns
    -------
    The result of calling `.is_between(left, right)` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(4))
    >>> df.filter(tp.between(col('x'), 1, 3))
    """
    column = _col_expr(x)
    in_range = column.is_between(left, right)
    return in_range
[docs]
def case_when(*args, _default = pl.Null):
    """
    Case when

    Builds a chained `pl.when(...).then(...)` expression from alternating
    condition/value arguments, finishing with `.otherwise(_default)`.

    Parameters
    ----------
    *args : Expr, value
        Alternating arguments: a logical expression followed by the value
        to use where that expression holds. Each value is passed through
        `_str_to_lit` before use.
    _default : optional
        Value used where no condition holds. Also passed through
        `_str_to_lit`.

    Returns
    -------
    The chained when/then/otherwise expression

    Raises
    ------
    ValueError
        If no arguments are given, or a condition has no matching value

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> df.mutate(
    >>>    case_x = tp.case_when(col('x') < 2, 1,
    >>>                          col('x') < 3, 2,
    >>>                          _default = 0)
    >>> )
    """
    # Fail early with a clear message instead of an UnboundLocalError
    # (no arguments) or IndexError (condition without a value).
    if len(args) == 0 or len(args) % 2 != 0:
        raise ValueError(
            "`case_when` requires one or more condition/value pairs"
        )
    conditions = args[0::2]
    values = [_str_to_lit(value) for value in args[1::2]]
    (first_condition, first_value), *remaining = zip(conditions, values)
    expr = pl.when(first_condition).then(first_value)
    for condition, value in remaining:
        expr = expr.when(condition).then(value)
    return expr.otherwise(_str_to_lit(_default))
[docs]
def cast(x, dtype):
    """
    Convert a column to an arbitrary data type

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    dtype : DataType
        Type to convert to

    Returns
    -------
    The result of calling `.cast(dtype)` on the column

    Examples
    --------
    >>> df.mutate(float_x = tp.cast(col('x'), tp.Float64))
    """
    return _col_expr(x).cast(dtype)
[docs]
def coalesce(*args):
    """
    Coalesce missing values

    Walks the columns left to right, replacing values that are still null
    with the value from the next column.

    Parameters
    ----------
    args : Expr
        Columns to coalesce. A single column is returned unchanged.

    Returns
    -------
    Expr
        The coalesced expression

    Raises
    ------
    ValueError
        If no columns are given

    Examples
    --------
    >>> df.mutate(coalesce_xy = tp.coalesce(col('x'), col('y')))
    """
    args = _as_list(args)
    if len(args) == 0:
        raise ValueError("`coalesce` requires at least one column")
    # Starting from the first column also covers the single-column case,
    # which previously failed with an IndexError on `args[1]`.
    expr = args[0]
    for fallback in args[1:]:
        expr = if_else(expr.is_null(), fallback, expr)
    return expr
[docs]
def cor(x, y, method = 'pearson'):
    """
    Find the correlation of two columns

    Parameters
    ----------
    x : Expr
        A column
    y : Expr
        A column
    method : str
        Type of correlation to find. Either 'pearson' or 'spearman'.

    Returns
    -------
    The result of `pl.corr(x, y, method = method)`

    Raises
    ------
    ValueError
        If `method` is neither 'pearson' nor 'spearman'

    Examples
    --------
    >>> df.summarize(cor = tp.cor(col('x'), col('y')))
    """
    # The exception used to be constructed but never raised, so invalid
    # methods were not rejected here.
    if method not in ('pearson', 'spearman'):
        raise ValueError("`method` must be either 'pearson' or 'spearman'")
    return pl.corr(x, y, method = method)
[docs]
def cov(x, y):
    """
    Compute the covariance between two columns

    Parameters
    ----------
    x : Expr
        First column
    y : Expr
        Second column

    Returns
    -------
    The result of `pl.cov(x, y)`

    Examples
    --------
    >>> df.summarize(cov = tp.cov(col('x'), col('y')))
    """
    covariance = pl.cov(x, y)
    return covariance
[docs]
def count(x):
    """
    Count the observations of a column in each group

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.count()` on the column

    Examples
    --------
    >>> df.summarize(count = tp.count(col('x')))
    """
    return _col_expr(x).count()
[docs]
def first(x):
    """
    Return the first value of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.first()` on the column

    Examples
    --------
    >>> df.summarize(first_x = tp.first('x'))
    >>> df.summarize(first_x = tp.first(col('x')))
    """
    return _col_expr(x).first()
[docs]
def floor(x):
    """
    Round the values of a column down to the nearest lower integer

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.floor()` on the column

    Examples
    --------
    >>> df.mutate(floor_x = tp.floor(col('x')))
    """
    return _col_expr(x).floor()
[docs]
def if_else(condition, true, false):
    """
    Choose between two values based on a condition

    Thin wrapper around `case_when` with one condition/value pair and
    `false` as the default.

    Parameters
    ----------
    condition : Expr
        A logical expression
    true :
        Value used where the condition is true
    false :
        Value used where the condition is false

    Returns
    -------
    The expression built by `case_when`

    Examples
    --------
    >>> df = tp.tibble(x = range(1, 4))
    >>> df.mutate(if_x = tp.if_else(col('x') < 2, 1, 2))
    """
    branches = (condition, true)
    return case_when(*branches, _default = false)
[docs]
def is_finite(x):
    """
    Check which values of a column are finite

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.is_finite()` on the column

    Examples
    --------
    >>> df = tp.tibble(x = [1.0, float('inf')])
    >>> df.filter(tp.is_finite(col('x')))
    """
    return _col_expr(x).is_finite()
[docs]
def is_in(x, y):
    """
    Check which values of a column appear in a list of values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    y : list
        Values to test membership against

    Returns
    -------
    The result of calling `.is_in(y)` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_in(col('x'), [1, 2]))
    """
    return _col_expr(x).is_in(y)
[docs]
def is_infinite(x):
    """
    Check which values of a column are infinite

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.is_infinite()` on the column

    Examples
    --------
    >>> df = tp.tibble(x = [1.0, float('inf')])
    >>> df.filter(tp.is_infinite(col('x')))
    """
    return _col_expr(x).is_infinite()
[docs]
def is_not(x):
    """
    Negate the values of a boolean column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.not_()` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_not(col('x') < 2))
    """
    return _col_expr(x).not_()
[docs]
def is_nan(x):
    """
    Check which values of a column are NaN

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.is_nan()` on the column

    Examples
    --------
    >>> df = tp.tibble(x = [1.0, float('nan')])
    >>> df.filter(tp.is_nan(col('x')))
    """
    return _col_expr(x).is_nan()
[docs]
def is_not_in(x, y):
    """
    Check which values of a column are absent from a list of values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    y : list
        Values to test membership against

    Returns
    -------
    The negation (`.not_()`) of `.is_in(y)` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_not_in(col('x'), [1, 2]))
    """
    found = _col_expr(x).is_in(y)
    return found.not_()
[docs]
def is_not_null(x):
    """
    Check which values of a column are not null

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The negation (`.not_()`) of `.is_null()` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_not_null(col('x')))
    """
    missing = _col_expr(x).is_null()
    return missing.not_()
[docs]
def is_null(x):
    """
    Check which values of a column are null

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.is_null()` on the column

    Examples
    --------
    >>> df = tp.tibble(x = range(3))
    >>> df.filter(tp.is_null(col('x')))
    """
    return _col_expr(x).is_null()
[docs]
def lag(x, n: int = 1, default = None):
    """
    Shift a column by `n` positions to get lagging values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    n : int
        Number of positions to lag by
    default : optional
        Passed to `shift` as `fill_value` for the positions left empty

    Returns
    -------
    The result of calling `.shift(n, fill_value = default)` on the column

    Examples
    --------
    >>> df.mutate(lag_x = tp.lag(col('x')))
    >>> df.mutate(lag_x = tp.lag('x'))
    """
    column = _col_expr(x)
    shifted = column.shift(n, fill_value = default)
    return shifted
[docs]
def last(x):
    """
    Return the last value of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.last()` on the column

    Examples
    --------
    >>> df.summarize(last_x = tp.last('x'))
    >>> df.summarize(last_x = tp.last(col('x')))
    """
    return _col_expr(x).last()
[docs]
def lead(x, n: int = 1, default = None):
    """
    Shift a column by `-n` positions to get leading values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    n : int
        Number of positions to lead by
    default : optional
        Passed to `shift` as `fill_value` for the positions left empty

    Returns
    -------
    The result of calling `.shift(-n, fill_value = default)` on the column

    Examples
    --------
    >>> df.mutate(lead_x = tp.lead(col('x')))
    >>> df.mutate(lead_x = tp.lead('x'))
    """
    column = _col_expr(x)
    shifted = column.shift(-n, fill_value = default)
    return shifted
[docs]
def length(x):
    """
    Count the observations of a column in each group

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.count()` on the column

    Examples
    --------
    >>> df.summarize(length = tp.length(col('x')))
    """
    # NOTE(review): this is identical to `count` (both call `.count()`);
    # confirm whether `length` is meant to treat null values differently.
    return _col_expr(x).count()
[docs]
def log(x):
    """
    Take the natural logarithm of a column

    Parameters
    ----------
    x : Expr
        Column to operate on

    Returns
    -------
    The result of calling `.log()` on the column

    Examples
    --------
    >>> df.mutate(log = tp.log('x'))
    """
    return _col_expr(x).log()
[docs]
def log10(x):
    """
    Take the base 10 logarithm of a column

    Parameters
    ----------
    x : Expr
        Column to operate on

    Returns
    -------
    The result of calling `.log10()` on the column

    Examples
    --------
    >>> df.mutate(log = tp.log10('x'))
    """
    return _col_expr(x).log10()
[docs]
def max(x):
    """
    Return the maximum of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.max()` on the column

    Examples
    --------
    >>> df.summarize(max_x = tp.max('x'))
    >>> df.summarize(max_x = tp.max(col('x')))
    """
    return _col_expr(x).max()
[docs]
def mean(x):
    """
    Return the mean of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.mean()` on the column

    Examples
    --------
    >>> df.summarize(mean_x = tp.mean('x'))
    >>> df.summarize(mean_x = tp.mean(col('x')))
    """
    return _col_expr(x).mean()
[docs]
def min(x):
    """
    Return the minimum of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.min()` on the column

    Examples
    --------
    >>> df.summarize(min_x = tp.min('x'))
    >>> df.summarize(min_x = tp.min(col('x')))
    """
    return _col_expr(x).min()
[docs]
def n():
    """
    Count the observations in each group

    Returns
    -------
    The `pl.len()` expression

    Examples
    --------
    >>> df.summarize(count = tp.n())
    """
    group_size = pl.len()
    return group_size
[docs]
def n_distinct(x):
    """
    Count the distinct values in a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.n_unique()` on the column

    Examples
    --------
    >>> df.summarize(n_distinct_x = tp.n_distinct('x'))
    >>> df.summarize(n_distinct_x = tp.n_distinct(col('x')))
    """
    return _col_expr(x).n_unique()
[docs]
def quantile(x, quantile = .5):
    """
    Return a quantile of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    quantile : float
        Quantile to return

    Returns
    -------
    The result of calling `.quantile(quantile)` on the column

    Examples
    --------
    >>> df.summarize(quantile_x = tp.quantile('x', .25))
    """
    column = _col_expr(x)
    return column.quantile(quantile)
[docs]
def read_csv(file: str,
             *args,
             **kwargs):
    """Read a CSV with polars.read_csv and convert the result with from_polars"""
    frame = pl.read_csv(file, *args, **kwargs)
    return from_polars(frame)
[docs]
def read_parquet(source: str,
                 *args,
                 **kwargs):
    """Read a parquet file with polars.read_parquet and convert the result with from_polars"""
    frame = pl.read_parquet(source, *args, **kwargs)
    return from_polars(frame)
[docs]
def rep(x, times = 1):
    """
    Replicate the values in x

    Constants, Series, lists and other iterables are turned into a list,
    repeated `times` times and returned as a `pl.Series`. A tibble is
    concatenated with itself `times` times and passed through
    `from_polars`.

    Parameters
    ----------
    x : const, Series
        Value or Series to repeat
    times : int
        Number of times to repeat

    Returns
    -------
    A `pl.Series` of the repeated values, or the `from_polars` result
    when `x` is a tibble

    Raises
    ------
    ValueError
        If `x` is none of the supported types

    Examples
    --------
    >>> tp.rep(1, 3)
    >>> tp.rep(pl.Series(range(3)), 3)
    """
    if _is_constant(x):
        out = [x]
    elif _is_series(x):
        out = x.to_list()
    elif _is_list(x):
        out = x
    elif isinstance(x, tibble):
        # Checked before the generic iterable branch so data frames are
        # stacked rather than converted to a list.
        out = pl.concat([x for _ in range(times)]).pipe(from_polars)
    elif _is_iterable(x):
        out = list(x)
    else:
        # Previously the exception was built but not raised, so execution
        # fell through to an UnboundLocalError on `out`.
        raise ValueError("Incompatible type")
    if _is_list(out):
        out = pl.Series(out * times)
    return out
[docs]
def replace_null(x, replace = None):
    """
    Replace null values

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    replace : optional
        Value passed to `fill_null`. When omitted (None), `x` is returned
        unchanged.

    Returns
    -------
    `x` itself when `replace` is None, otherwise the result of
    `x.fill_null(replace)`

    Examples
    --------
    >>> df = tp.tibble(x = [0, None], y = [None, None])
    >>> df.mutate(x = tp.replace_null(col('x'), 1))
    """
    # Identity check: `replace == None` would call an overloaded `__eq__`
    # (e.g. on an expression or array), whose result cannot be used as a
    # plain boolean in an `if`.
    if replace is None:
        return x
    return x.fill_null(replace)
[docs]
def round(x, decimals = 0):
    """
    Round the values of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on
    decimals : int
        Decimals to round to

    Returns
    -------
    The result of calling `.round(decimals)` on the column

    Examples
    --------
    >>> df.mutate(x = tp.round(col('x')))
    """
    return _col_expr(x).round(decimals)
[docs]
def row_number():
    """
    Number the rows, starting at 1

    Returns
    -------
    The expression `pl.int_range(0, pl.len()) + 1`

    Examples
    --------
    >>> df.mutate(row_num = tp.row_number())
    """
    # int_range starts at 0, so shift by one for 1-based row numbers.
    zero_based = pl.int_range(0, pl.len())
    return zero_based + 1
[docs]
def sd(x):
    """
    Return the standard deviation of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.std()` on the column

    Examples
    --------
    >>> df.summarize(sd_x = tp.sd('x'))
    >>> df.summarize(sd_x = tp.sd(col('x')))
    """
    return _col_expr(x).std()
[docs]
def sqrt(x):
    """
    Take the square root of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.sqrt()` on the column

    Examples
    --------
    >>> df.mutate(sqrt_x = tp.sqrt('x'))
    """
    return _col_expr(x).sqrt()
[docs]
def sum(x):
    """
    Return the sum of a column

    Parameters
    ----------
    x : Expr, Series
        Column to operate on

    Returns
    -------
    The result of calling `.sum()` on the column

    Examples
    --------
    >>> df.summarize(sum_x = tp.sum('x'))
    >>> df.summarize(sum_x = tp.sum(col('x')))
    """
    return _col_expr(x).sum()
[docs]
def var(x):
    """
    Return the variance of a column

    Parameters
    ----------
    x : Expr
        Column to operate on

    Returns
    -------
    The result of calling `.var()` on the column

    Examples
    --------
    >>> df.summarize(var_x = tp.var('x'))
    >>> df.summarize(var_x = tp.var(col('x')))
    """
    return _col_expr(x).var()