data

outlier_contamination

Contaminates one column of a data matrix with outliers.

Parameters (inputs)
----------
X: a pandas/polars data frame (the data matrix). The column selected by `col_name` represents the statistical variable to contaminate.
col_name: the name of a column of `X`.
prop_below: proportion of outliers generated in the lower tail of the selected column of `X`. Only used if below = True.
prop_above: proportion of outliers generated in the upper tail of the selected column of `X`. Only used if above = True.
sigma: parameter that controls the upper bound of the generated upper-tail outliers and the lower bound of the generated lower-tail outliers.
random_state: controls the random seed of the random elements.

Returns (outputs)
-------
X_new: the resulting variable after the outlier contamination of `X`.
outlier_idx_below: the index of the below outliers.
outlier_idx_above: the index of the above outliers.

Example

import pandas as pd
import polars as pl
from sklearn.datasets import make_blobs
from FastKmedoids.data import outlier_contamination
from BigEDA.descriptive import outliers_table
from BigEDA.plots import boxplot_matrix
X, Y = make_blobs(n_samples=35000, centers=4, cluster_std=[2,2,2,3], n_features=8, random_state=123)
X = pd.DataFrame(X)      
X.columns = [f"X{i}" for i in range(1, X.shape[1]+1)]

# Two quantitative variables are converted to binary and two others to multiclass by discretizing them.
X['X5'] = pd.cut(X['X5'], bins=[X['X5'].min()-1, X['X5'].mean(), X['X5'].max()+1], labels=False)
X['X6'] = pd.cut(X['X6'], bins=[X['X6'].min()-1, X['X6'].mean(), X['X6'].max()+1], labels=False)
X['X7'] = pd.cut(X['X7'], bins=[X['X7'].min()-1, X['X7'].quantile(0.25), X['X7'].quantile(0.50), X['X7'].quantile(0.75), X['X7'].max()+1], labels=False)
X['X8'] = pd.cut(X['X8'], bins=[X['X8'].min()-1, X['X8'].quantile(0.25), X['X8'].quantile(0.50), X['X8'].quantile(0.75), X['X8'].max()+1], labels=False)   

X_outliers, outliers_idx_X1 = outlier_contamination(X, col_name='X1', prop_above=0.1, sigma=3, random_state=123)
X_outliers, outliers_idx_X2 = outlier_contamination(X_outliers, col_name='X2', prop_below=0.1, sigma=5, random_state=123)
X_outliers_pl = pl.from_pandas(X_outliers)
X_not_outliers_pl = pl.from_pandas(X)

X = X_outliers.copy()
outliers_table(X_not_outliers_pl, auto=False, col_names=['X1', 'X2', 'X3', 'X4'], h=1.5)

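| quant_variables | lower_bound | upper_bound | n_outliers | n_not_outliers | prop_outliers | prop_not_outliers |
|-----------------|-------------|-------------|------------|----------------|---------------|-------------------|
| "X1"            | -14.782543  | 15.560064   | 0          | 35000          | 0.0           | 1.0               |
| "X2"            | -11.860462  | 3.64293     | 144        | 34856          | 0.004114      | 0.995886          |
| "X3"            | -11.847622  | 6.501432    | 43         | 34957          | 0.001229      | 0.998771          |
| "X4"            | -10.074609  | 11.152553   | 519        | 34481          | 0.014829      | 0.985171          |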

outliers_table(X_outliers_pl, auto=False, col_names=['X1', 'X2', 'X3', 'X4'], h=1.5)

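| quant_variables | lower_bound | upper_bound | n_outliers | n_not_outliers | prop_outliers | prop_not_outliers |
|-----------------|-------------|-------------|------------|----------------|---------------|-------------------|
| "X1"            | -17.363535  | 16.862265   | 1657       | 33343          | 0.047343      | 0.952657          |
| "X2"            | -13.537163  | 4.267825    | 3468       | 31532          | 0.099086      | 0.900914          |
| "X3"            | -11.847622  | 6.501432    | 43         | 34957          | 0.001229      | 0.998771          |
| "X4"            | -10.074609  | 11.152553   | 519        | 34481          | 0.014829      | 0.985171          |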

boxplot_matrix(X_not_outliers_pl, n_cols=2, title='Box-plot - Quantitative variables - Before outliers contamination', 
               figsize=(10,5), quant_col_names=['X1', 'X2', 'X3', 'X4'], n_xticks=6, title_fontsize=13, 
               save=False, x_rotation=0, title_height=0.99, style='whitegrid', hspace=0.7, wspace=0.15, 
               title_weight='bold', subtitles_fontsize=12, xlabel_size=10)

My Local Image

boxplot_matrix(X_outliers_pl, n_cols=2, title='Box-plot - Quantitative variables - After outliers contamination', 
               figsize=(10,5), quant_col_names=['X1', 'X2', 'X3', 'X4'], n_xticks=6, title_fontsize=13, 
               save=False, x_rotation=0, title_height=0.99, style='whitegrid', hspace=0.7, wspace=0.15, 
               title_weight='bold', subtitles_fontsize=12, xlabel_size=10)

My Local Image