Quick Start#

Imports and Paths#

import os, sys
import numpy as np


script_path = os.getcwd()
project_path = os.path.join(script_path, '..')
sys.path.append(project_path)

Installation#

# pip install db-robust-clust

To see the available versions of the package, go to the release history on PyPI: https://pypi.org/project/db_robust_clust/#history

Data#

| Variable | Description | Variable Type | Possible Categories / Range |
|---|---|---|---|
| latitude | Latitude of the house | Quantitative | 24.86 - 25.27 |
| longitude | Longitude of the house | Quantitative | 55.06 - 55.44 |
| price | Market price of the house | Quantitative | 220000 - 35000000 |
| price_per_sqft | Price per square foot | Quantitative | 361.87 - 4805.87 |
| size_in_sqft | Size in square feet | Quantitative | 294 - 9576 |
| no_of_bedrooms | Number of bedrooms in the house | Multiclass | 0, 1, 2, 3, 4, 5 |
| no_of_bathrooms | Number of bathrooms in the house | Multiclass | 0, 1, 2, 3, 4, 5, 6 |
| quality | Quality level of the house (response variable) | Binary | Low (0), Medium-High-UltraHigh (1) |
| balcony | Indicates if the house has a balcony | Binary | true (1), false (0) |
| barbecue_area | Indicates if the house has a barbecue area | Binary | true (1), false (0) |
| private_pool | Indicates if the house has a private pool | Binary | true (1), false (0) |
| private_garden | Indicates if the house has a private garden | Binary | true (1), false (0) |

Note: The quality variable is the response (target) variable. The remaining variables are mixed-type predictors.

import polars as pl

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/db_robust_clust-docu/refs/heads/main/data/dubai_houses_processed.csv"
df = pl.read_csv(data_url)

response = 'quality'
quant_predictors = ['latitude', 'longitude', 'price', 'size_in_sqft', 'price_per_sqft']
binary_predictors = ['balcony', 'barbecue_area', 'private_pool', 'private_garden']
multiclass_predictors = ['no_of_bedrooms', 'no_of_bathrooms']

y = df[response]
X = df[quant_predictors + binary_predictors + multiclass_predictors]
y.head()
shape: (10,)
quality
i64
1
1
1
0
1
1
1
1
0
1
X.head()
shape: (5, 11)
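| latitude | longitude | price | size_in_sqft | price_per_sqft | balcony | barbecue_area | private_pool | private_garden | no_of_bedrooms | no_of_bathrooms |
|---|---|---|---|---|---|---|---|---|---|---|
| f64 | f64 | i64 | i64 | f64 | i64 | i64 | i64 | i64 | i64 | i64 |
| 25.113208 | 55.138932 | 2700000 | 1079 | 2502.32 | 1 | 1 | 0 | 0 | 1 | 2 |
| 25.106809 | 55.151201 | 2850000 | 1582 | 1801.52 | 1 | 0 | 0 | 0 | 2 | 2 |
| 25.063302 | 55.137728 | 1150000 | 1951 | 589.44 | 1 | 0 | 0 | 0 | 3 | 5 |
| 25.227295 | 55.341761 | 2850000 | 2020 | 1410.89 | 1 | 0 | 0 | 0 | 2 | 3 |
| 25.114275 | 55.139764 | 1729200 | 507 | 3410.65 | 0 | 0 | 0 | 0 | 0 | 1 |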
n = len(X)
raw_weights = np.random.rand(n)
simulated_weights = raw_weights / np.sum(raw_weights)

db_robust_clust.models#

p1 = len(quant_predictors)
p2 = len(binary_predictors)
p3 = len(multiclass_predictors)

n_clusters = len(y.unique())

SampleDistClustering#

from db_robust_clust.models import SampleDistClustering

from kmedoids import KMedoids

KMedoids - pam#

kmedoids_method = 'pam'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pam', n_clusters=2,
                                                random_state=123),
                     frac_sample_size=0.1, metric='euclidean',
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1858,   47], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pam', n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1380,  525], dtype=int64))
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

weights = simulated_weights

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X=X, weights=weights)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pam', n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1385,  520], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05, 
)

sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-7.09e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.47e+01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pam', n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='relms', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1382,  523], dtype=int64))

KMedoids - fasterpam#

kmedoids_method = 'fasterpam'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                n_clusters=2,
                                                random_state=123),
                     frac_sample_size=0.1, metric='euclidean',
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1858,   47], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1380,  525], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05, 
)

sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-7.09e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.47e+01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='relms', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1382,  523], dtype=int64))

KMedoids - alternate#

kmedoids_method = 'alternate'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='alternate',
                                                n_clusters=2,
                                                random_state=123),
                     frac_sample_size=0.1, metric='euclidean',
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1858,   47], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='alternate',
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1380,  525], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05, 
)

sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-7.09e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.47e+01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='alternate',
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='relms', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1382,  523], dtype=int64))

KMedoids - fastermsc#

kmedoids_method = 'fastermsc'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='fastermsc',
                                                n_clusters=2,
                                                random_state=123),
                     frac_sample_size=0.1, metric='euclidean',
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1890,   15], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='fastermsc',
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1897,    8], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05, 
)

sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-7.09e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.47e+01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='fastermsc',
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='relms', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1897,    8], dtype=int64))

KMedoids - pamsil#

kmedoids_method = 'pamsil'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pamsil', n_clusters=2,
                                                random_state=123),
                     frac_sample_size=0.1, metric='euclidean',
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([   6, 1899], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pamsil', n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([  10, 1895], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05, 
)

sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-7.09e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.47e+01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pamsil', n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='relms', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([  10, 1895], dtype=int64))

KMedoids - pammedsil#

kmedoids_method = 'pammedsil'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pammedsil',
                                                n_clusters=2,
                                                random_state=123),
                     frac_sample_size=0.1, metric='euclidean',
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1890,   15], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

sample_dist_clust.fit(X)
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pammedsil',
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='ggower', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1897,    8], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05, 
)

sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-7.09e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.47e+01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
SampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                method='pammedsil',
                                                n_clusters=2,
                                                random_state=123),
                     d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                     frac_sample_size=0.1, metric='relms', p1=5, p2=4, p3=2,
                     random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0], dtype=uint64)
X_new = X[:10,:]

sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(sample_dist_clust.labels_, return_counts=True)
(array([0, 1], dtype=uint64), array([1897,    8], dtype=int64))

FoldSampleDistClustering#

from db_robust_clust.models import FoldSampleDistClustering

from kmedoids import KMedoids

KMedoids - pam#

kmedoids_method = 'pam'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         metric='euclidean', random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1886,   19], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 0, 0, 1, 0, 0, 0, 0, 0, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([ 554, 1351], dtype=int64))
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

weights = simulated_weights

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

fold_sample_dist_clust.fit(X=X, weights=weights)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 1, 0, 1, 0, 0, 0, 0, 0, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([ 861, 1044], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

fold_sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.28e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1310,  595], dtype=int64))
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

weights = simulated_weights

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

fold_sample_dist_clust.fit(X=X, weights=weights)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-6.74e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 1, 0, 1, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1509,  396], dtype=int64))

KMedoids - fasterpam#

kmedoids_method = 'fasterpam'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    n_clusters=2,
                                                    random_state=123),
                         metric='euclidean', random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1886,   19], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 0, 0, 1, 0, 0, 0, 0, 0, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([ 554, 1351], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

fold_sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.28e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1310,  595], dtype=int64))

KMedoids - alternate#

kmedoids_method = 'alternate'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='alternate',
                                                    n_clusters=2,
                                                    random_state=123),
                         metric='euclidean', random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1886,   19], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='alternate',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 0, 0, 1, 0, 0, 0, 0, 0, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([ 554, 1351], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

fold_sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.28e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='alternate',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1310,  595], dtype=int64))

KMedoids - fastermsc#

kmedoids_method = 'fastermsc'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='fastermsc',
                                                    n_clusters=2,
                                                    random_state=123),
                         metric='euclidean', random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([  19, 1886], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='fastermsc',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1877,   28], dtype=int64))
metric = 'relms'#
metric = 'relms'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)

fold_sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.08e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='fastermsc',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1596,  309], dtype=int64))

KMedoids - pamsil#

kmedoids_method = 'pamsil'

clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
metric = 'euclidean'#
metric = 'euclidean'

fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pamsil',
                                                    n_clusters=2,
                                                    random_state=123),
                         metric='euclidean', random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1886,   19], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
# d1/d2/d3 presumably select the distance used for the quantitative, binary
# and multiclass variables respectively -- TODO confirm in the package docs.
d1, d2, d3 = 'robust_mahalanobis', 'jaccard', 'hamming'

# p1, p2 and p3 are defined in an earlier cell; the estimator repr printed
# after fitting reports p1=5, p2=4, p3=2.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed',
    alpha=0.05,
)

# Kept as the last expression of the cell so the fitted estimator is displayed.
fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pamsil',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 0, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 0, 0, 0, 0, 0, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([965, 940], dtype=int64))
metric = 'RelMS'#
metric = 'relms'
d1, d2, d3 = 'robust_mahalanobis', 'jaccard', 'hamming'

# p1, p2 and p3 are defined in an earlier cell; the estimator repr printed
# after fitting reports p1=5, p2=4, p3=2.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed',
    alpha=0.05,
)

# With this configuration fit() emits "Gram matrix ... is not PSD"
# UserWarnings from robust_mixed_dist; the message states that a
# transformation is applied.
fold_sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pamsil',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 0, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 0, 0, 0, 0, 1, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1076,  829], dtype=int64))

KMedoids - pammedsil#

kmedoids_method = 'pammedsil'

# K-Medoids base estimator for this section. It is configured with
# metric='precomputed', i.e. it is set up to receive a distance matrix.
clustering_method = KMedoids(
    n_clusters=n_clusters,
    method=kmedoids_method,
    metric='precomputed',
    init='build',
    max_iter=100,
    random_state=123,
)
metric = 'euclidean'#
metric = 'euclidean'

# Fold/sample wrapper around the K-Medoids estimator configured above,
# using Euclidean distances.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
)

# Kept as the last expression of the cell so the fitted estimator is displayed.
fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pammedsil',
                                                    n_clusters=2,
                                                    random_state=123),
                         metric='euclidean', random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([1, 1, 1, ..., 1, 1, 1])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([  19, 1886], dtype=int64))
metric = 'ggower'#
metric = 'ggower'
# d1/d2/d3 presumably select the distance used for the quantitative, binary
# and multiclass variables respectively -- TODO confirm in the package docs.
d1, d2, d3 = 'robust_mahalanobis', 'jaccard', 'hamming'

# p1, p2 and p3 are defined in an earlier cell; the estimator repr printed
# after fitting reports p1=5, p2=4, p3=2.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed',
    alpha=0.05,
)

# Kept as the last expression of the cell so the fitted estimator is displayed.
fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pammedsil',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1877,   28], dtype=int64))
metric = 'RelMS'#
metric = 'relms'
d1, d2, d3 = 'robust_mahalanobis', 'jaccard', 'hamming'

# p1, p2 and p3 are defined in an earlier cell; the estimator repr printed
# after fitting reports p1=5, p2=4, p3=2.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed',
    alpha=0.05,
)

# With this configuration fit() emits "Gram matrix ... is not PSD"
# UserWarnings from robust_mixed_dist; the message states that a
# transformation is applied.
fold_sample_dist_clust.fit(X)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.00e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.86e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-1.83e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-3.20e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d2 is not PSD (min eig=-3.10e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-2.03e+00). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\robust_mixed_dist\mixed.py:879: UserWarning: Gram matrix for d3 is not PSD (min eig=-1.08e-01). Transformation applied.
  warnings.warn(f'Gram matrix for d{i} is not PSD (min eig={eig_min_val:.2e}). Transformation applied.')
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pammedsil',
                                                    n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='relms', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
fold_sample_dist_clust.labels_
array([0, 0, 0, ..., 0, 0, 0])
X_new = X[:10,:]

fold_sample_dist_clust.predict(X_new)
[0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
np.unique(fold_sample_dist_clust.labels_, return_counts=True)
(array([0, 1]), array([1596,  309], dtype=int64))

db_robust_clust.plots#

clustering_MDS_plot_one_method#

from db_robust_clust.plots import clustering_MDS_plot_one_method
from sklearn.manifold import MDS
from robust_mixed_dist.mixed import generalized_gower_dist_matrix
import seaborn as sns
sns.set_style('whitegrid')
# Configuration of the clustering whose labels are plotted in this section.
kmedoids_method = 'pam'
metric = 'ggower'
d1, d2, d3 = 'robust_mahalanobis', 'jaccard', 'hamming'

# K-Medoids base estimator, set up for a precomputed distance matrix.
clustering_method = KMedoids(
    n_clusters=n_clusters,
    method=kmedoids_method,
    metric='precomputed',
    init='build',
    max_iter=100,
    random_state=123,
)

# p1, p2 and p3 are defined in an earlier cell; the estimator repr printed
# after fitting reports p1=5, p2=4, p3=2.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed',
    alpha=0.05,
)

# Kept as the last expression of the cell so the fitted estimator is displayed.
fold_sample_dist_clust.fit(X)
FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Two-dimensional MDS fitted on a precomputed dissimilarity matrix.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=123)

d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

# Draw 300 row indices to keep the distance matrix small. np.random.choice
# samples with replacement by default, so an index may be drawn more than once.
np.random.seed(123)
sample_idx = np.random.choice(range(X.shape[0]), 300)

# Generalized Gower distances between the sampled rows.
# Fix: p3 previously received p2 (p3=p2). The fitted estimators in this
# document report p2=4 and p3=2, so the wrong multiclass count was passed.
D = generalized_gower_dist_matrix(
    X=X[sample_idx, :], p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20,
)

# Embed the sampled observations; X_mds is reused by the plotting cells.
X_mds = mds.fit_transform(D)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\sklearn\manifold\_mds.py:744: FutureWarning: The default value of `n_init` will change from 4 to 1 in 1.9. To suppress this warning, provide some value of `n_init`.
  warnings.warn(
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\sklearn\manifold\_mds.py:754: FutureWarning: The default value of `init` will change from 'random' to 'classical_mds' in 1.10. To suppress this warning, provide some value of `init`.
  warnings.warn(
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\sklearn\manifold\_mds.py:771: FutureWarning: The `dissimilarity` parameter is deprecated and will be removed in 1.10. Use `metric` instead.
  warnings.warn(
# Plot the MDS embedding of the sampled rows, coloured by the cluster label
# assigned to each of those rows (labels_ is indexed with the same
# sample_idx that was used to build X_mds).
clustering_MDS_plot_one_method(
    X_mds=X_mds,
    y_pred=fold_sample_dist_clust.labels_[sample_idx],
    y_true=None,
    title="MDS visualization of clustering results",
    accuracy=None,
    time=None,
    figsize=(8, 7),
    bbox_to_anchor=(1, 1),
    title_size=13,
    title_weight='bold',
    points_size=45,
    title_height=1,
    save=False,
    legend_size=9,
)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
_images/5d1f92ee60bf6920cbbc8b829a4be4690bd0b78515d79333824e5964341d0a02.png

clustering_MDS_plot_multiple_methods#

from db_robust_clust.plots import clustering_MDS_plot_multiple_methods
from db_robust_clust.metrics import adjusted_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
import time
###############################################################################

# Shared configuration for the two db_robust_clust estimators compared below.
kmedoids_method = 'pam'
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

# K-Medoids base estimator, set up for a precomputed distance matrix.
clustering_method = KMedoids(
    n_clusters=n_clusters,
    metric='precomputed',
    method=kmedoids_method,
    init='build',
    max_iter=100,
    random_state=123,
)

sample_dist_clust = SampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed', alpha=0.05,
)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
sample_dist_clust.fit(X)
predicted_clusters_sample_dist_clust = sample_dist_clust.labels_
time_fast_kmedoids = time.perf_counter() - start_time

###############################################################################

# Fold-based variant, built on the same base estimator and distance settings.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method=clustering_method,
    metric=metric,
    n_splits=5,
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3,
    robust_method='trimmed', alpha=0.05,
)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
fold_sample_dist_clust.fit(X=X)
predicted_clusters_fold_sample_dist_clust = fold_sample_dist_clust.labels_
time_fold_fast_kmedoids = time.perf_counter() - start_time

###############################################################################

# Baselines fitted directly on X. Each one is timed with perf_counter(),
# a monotonic clock intended for measuring intervals (time.time() is
# wall-clock time and can jump if the system clock is adjusted).

# K-Means
start_time = time.perf_counter()
kmeans = KMeans(n_clusters=n_clusters, random_state=123, init='k-means++', n_init='auto', max_iter=300)
kmeans.fit(X)
predicted_clusters_kmeans = kmeans.labels_
time_kmeans = time.perf_counter() - start_time

###############################################################################

# Gaussian mixture model
start_time = time.perf_counter()
gmm = GaussianMixture(n_components=n_clusters, random_state=123)
gmm.fit(X)
predicted_clusters_gmm = gmm.predict(X)
time_gmm = time.perf_counter() - start_time

###############################################################################

# Plain K-Medoids (PAM) with Euclidean distances
start_time = time.perf_counter()
kmedoids = KMedoids(n_clusters=n_clusters, metric='euclidean', method='pam', init='build', max_iter=100, random_state=123)
kmedoids.fit(X)
predicted_clusters_kmedoids = kmedoids.predict(X)
time_kmedoids = time.perf_counter() - start_time

###############################################################################

# Score every method against the true labels with balanced accuracy.
# adjusted_score returns a (score, predictions) pair; presumably the
# predictions are the cluster labels re-mapped to best match y_true --
# TODO confirm against the db_robust_clust.metrics documentation.
adj_accuracy_sample_dist_clust, adj_predicted_clusters_sample_dist_clust = adjusted_score(
    y_pred=predicted_clusters_sample_dist_clust,
    y_true=y,
    metric=balanced_accuracy_score,
)
adj_accuracy_fold_sample_dist_clust, adj_predicted_clusters_fold_sample_dist_clust = adjusted_score(
    y_pred=predicted_clusters_fold_sample_dist_clust,
    y_true=y,
    metric=balanced_accuracy_score,
)
adj_accuracy_kmeans, adj_predicted_clusters_kmeans = adjusted_score(
    y_pred=predicted_clusters_kmeans,
    y_true=y,
    metric=balanced_accuracy_score,
)
adj_accuracy_gmm, adj_predicted_clusters_gmm = adjusted_score(
    y_pred=predicted_clusters_gmm,
    y_true=y,
    metric=balanced_accuracy_score,
)
adj_accuracy_kmedoids, adj_predicted_clusters_kmedoids = adjusted_score(
    y_pred=predicted_clusters_kmedoids,
    y_true=y,
    metric=balanced_accuracy_score,
)
# Display names used as keys of the three dictionaries below, listed once so
# that the keys cannot drift apart. Dictionary insertion order follows this list.
mds_plot_method_names = [
    'SampleDistClust-RobustGGower-KMedoidsPAM',
    'FoldSampleDistClust-RobustGGower-KMedoidsPAM',
    'Kmeans',
    'GMM',
    'Kmedoids',
]

# Adjusted predictions restricted to the rows that were embedded with MDS.
y_pred_dict = dict(zip(mds_plot_method_names, [
    adj_predicted_clusters_sample_dist_clust[sample_idx],
    adj_predicted_clusters_fold_sample_dist_clust[sample_idx],
    adj_predicted_clusters_kmeans[sample_idx],
    adj_predicted_clusters_gmm[sample_idx],
    adj_predicted_clusters_kmedoids[sample_idx],
]))

# Balanced accuracy of each method, computed on the full data set.
accuracy_dict = dict(zip(mds_plot_method_names, [
    adj_accuracy_sample_dist_clust,
    adj_accuracy_fold_sample_dist_clust,
    adj_accuracy_kmeans,
    adj_accuracy_gmm,
    adj_accuracy_kmedoids,
]))

# Elapsed fitting time of each method.
time_dict = dict(zip(mds_plot_method_names, [
    time_fast_kmedoids,
    time_fold_fast_kmedoids,
    time_kmeans,
    time_gmm,
    time_kmedoids,
]))
# Plot the methods in a two-row grid on the shared MDS embedding.
# y_true is indexed with the same sample_idx that was used to build X_mds
# and the entries of y_pred_dict.
clustering_MDS_plot_multiple_methods(
    X_mds=X_mds,
    y_pred=y_pred_dict,
    y_true=y[sample_idx],
    title="MDS visualization of clustering results",
    accuracy=accuracy_dict,
    time=time_dict,
    n_rows=2,
    figsize=(15, 10),
    bbox_to_anchor=(0.68, -1.9),
    title_size=13,
    subtitles_size=10,
    title_weight='bold',
    points_size=45,
    title_height=0.98,
    legend_size=8,
    wspace=0.25,
    hspace=0.45,
    legend_title='Cluster Labels',
    n_cols_legend=4,
    save=False,
)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
_images/540950c13cb5d3acf7182e9adb46fadca85e7640598e3ff46aceec79280b6498.png