Quick Start

Quick Start#

Imports and Paths#

import os, sys
import numpy as np


# Append the parent of the current working directory to the import search
# path so that modules located there can be imported from this notebook.
script_path = os.getcwd()
project_path = os.path.join(script_path, os.pardir)
sys.path.append(project_path)

Installation#

# pip install db-robust-clust

To see the available versions of the package, go to the release history on PyPI: https://pypi.org/project/db_robust_clust/#history

Data#

Variable	Description	Variable Type	Possible Categories / Range
latitude	Latitude of the house	Quantitative	24.86 - 25.27
longitude	Longitude of the house	Quantitative	55.06 - 55.44
price	Market price of the house	Quantitative	220000 - 35000000
price_per_sqft	Price per square foot	Quantitative	361.87 - 4805.87
size_in_sqft	Size in square feet	Quantitative	294 - 9576
no_of_bedrooms	Number of bedrooms in the house	Multiclass	0, 1, 2, 3, 4, 5
no_of_bathrooms	Number of bathrooms in the house	Multiclass	0, 1, 2, 3, 4, 5, 6
quality	Quality level of the house (response variable)	Binary	Low (0), Medium-High-UltraHigh (1)
balcony	Indicates if the house has a balcony	Binary	true (1), false (0)
barbecue_area	Indicates if the house has a barbecue area	Binary	true (1), false (0)
private_pool	Indicates if the house has a private pool	Binary	true (1), false (0)
private_garden	Indicates if the house has a private garden	Binary	true (1), false (0)

Note: The quality variable is the response (target) variable. The remaining variables are mixed-type predictors.

import polars as pl

# Read the processed Dubai houses CSV directly from its raw GitHub URL.
data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/db_robust_clust-docu/refs/heads/main/data/dubai_houses_processed.csv"
df = pl.read_csv(data_url)

# Column roles: the response plus the predictors grouped by variable type.
response = 'quality'
quant_predictors = [
    'latitude', 'longitude', 'price', 'size_in_sqft', 'price_per_sqft',
]
binary_predictors = [
    'balcony', 'barbecue_area', 'private_pool', 'private_garden',
]
multiclass_predictors = ['no_of_bedrooms', 'no_of_bathrooms']

# Predictor columns are ordered quantitative -> binary -> multiclass.
y = df[response]
X = df[[*quant_predictors, *binary_predictors, *multiclass_predictors]]

# Peek at the first entries of the response column.
y.head()

shape: (10,)

quality
i64
1
1
1
0
1
1
1
1
0
1

# Peek at the first rows of the predictor table.
X.head()

shape: (5, 11)

latitude	longitude	price	size_in_sqft	price_per_sqft	balcony	barbecue_area	private_pool	private_garden	no_of_bedrooms	no_of_bathrooms
f64	f64	i64	i64	f64	i64	i64	i64	i64	i64	i64
25.113208	55.138932	2700000	1079	2502.32	1	1	0	0	1	2
25.106809	55.151201	2850000	1582	1801.52	1	0	0	0	2	2
25.063302	55.137728	1150000	1951	589.44	1	0	0	0	3	5
25.227295	55.341761	2850000	2020	1410.89	1	0	0	0	2	3
25.114275	55.139764	1729200	507	3410.65	0	0	0	0	0	1

# Simulate one random weight per observation, then rescale the weights so
# that they add up to 1.
n = X.shape[0]
raw_weights = np.random.rand(n)
simulated_weights = raw_weights / raw_weights.sum()

`db_robust_clust.models`#

# Number of quantitative (p1), binary (p2) and multiclass (p3) predictors.
p1, p2, p3 = map(len, (quant_predictors, binary_predictors, multiclass_predictors))

# Use as many clusters as there are distinct values in the response.
n_clusters = len(y.unique())

`db_robust_clust.plots`#

`clustering_MDS_plot_one_method`#

from db_robust_clust.plots import clustering_MDS_plot_one_method
from sklearn.manifold import MDS
from robust_mixed_dist.mixed import generalized_gower_dist_matrix
import seaborn as sns
sns.set_style('whitegrid')

# Settings for this example: k-medoids (PAM) combined with the 'ggower' metric.
kmedoids_method = 'pam'
metric = 'ggower'
# One distance name per variable block; presumably d1/d2/d3 apply to the
# quantitative/binary/multiclass columns respectively — confirm in the package docs.
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

# NOTE(review): KMedoids and FoldSampleDistClustering are not imported in this
# snippet; they are assumed to be in scope from an earlier cell.
# Base clusterer, configured with metric='precomputed' and a fixed seed.
clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)

# Wrap the base clusterer. p1/p2/p3 are the sizes of the quantitative, binary
# and multiclass predictor groups, in the same order as the columns of X.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05,
)

# Fit on the predictors only; the response is not passed to the model.
fold_sample_dist_clust.fit(X)

FoldSampleDistClustering(clustering_method=KMedoids(init='build', max_iter=100,
                                                    method='pam', n_clusters=2,
                                                    random_state=123),
                         d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                         metric='ggower', p1=5, p2=4, p3=2, random_state=123)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

FoldSampleDistClustering

Fitted

Parameters

	clustering_method	KMedoids(init...dom_state=123)
	metric	'ggower'
	n_splits	5
	shuffle	True
	random_state	123
	stratify	False
	frac_sample_size	0.1
	meta_frac_sample_size	0.8
	p1	5
	p2	4
	p3	2
	d1	'robust_mahalanobis'
	d2	'jaccard'
	d3	'hamming'
	q	1
	robust_method	'trimmed'
	alpha	0.05

clustering_method: KMedoids

KMedoids(init='build', max_iter=100, method='pam', n_clusters=2,
         random_state=123)

KMedoids

Parameters

	n_clusters	2
	metric	'precomputed'
	metric_params	None
	method	'pam'
	init	'build'
	max_iter	100
	random_state	123

# Two-dimensional MDS fitted on a precomputed dissimilarity matrix.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=123)

# Distance used for each variable block of the generalized Gower distance.
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

# Seeded subsample of 300 row indices, so the embedding and the plots built
# from it are reproducible. `np.random.choice` samples with replacement by
# default, so an index may appear more than once.
np.random.seed(123)
sample_idx = np.random.choice(range(X.shape[0]), 300)

# Pairwise generalized Gower distances between the sampled rows.
# p1/p2/p3 must be the sizes of the quantitative, binary and multiclass
# column groups of X (5 + 4 + 2 = 11 columns); the original passed p3=p2,
# which declared 4 multiclass columns instead of 2.
D = generalized_gower_dist_matrix(
        X=X[sample_idx,:], p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3,
        robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20
    )

# 2-D coordinates of the sampled rows, reused by the plotting helpers.
X_mds = mds.fit_transform(D)

c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\sklearn\manifold\_mds.py:744: FutureWarning: The default value of `n_init` will change from 4 to 1 in 1.9. To suppress this warning, provide some value of `n_init`.
  warnings.warn(
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\sklearn\manifold\_mds.py:754: FutureWarning: The default value of `init` will change from 'random' to 'classical_mds' in 1.10. To suppress this warning, provide some value of `init`.
  warnings.warn(
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\sklearn\manifold\_mds.py:771: FutureWarning: The `dissimilarity` parameter is deprecated and will be removed in 1.10. Use `metric` instead.
  warnings.warn(

# Plot the MDS coordinates of the sampled rows, passing the fitted model's
# labels for those same rows (indexed with sample_idx so both stay aligned).
# No true labels, accuracy or timing are supplied, and the figure is not saved.
clustering_MDS_plot_one_method(X_mds=X_mds, y_pred=fold_sample_dist_clust.labels_[sample_idx], 
                               y_true=None, title="MDS visualization of clustering results", 
                               accuracy=None, time=None, 
                               figsize=(8,7), bbox_to_anchor=(1,1), 
                               title_size=13, title_weight='bold', 
                               points_size=45, title_height=1, 
                               save=False, legend_size=9)

c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)

_images/5d1f92ee60bf6920cbbc8b829a4be4690bd0b78515d79333824e5964341d0a02.png

`clustering_MDS_plot_multiple_methods`#

from db_robust_clust.plots import clustering_MDS_plot_multiple_methods
from db_robust_clust.metrics import adjusted_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
import time

###############################################################################

# Method 1: SampleDistClustering wrapping k-medoids (PAM) with the 'ggower' metric.
# NOTE(review): SampleDistClustering is not imported in this snippet; it is
# assumed to be in scope from an earlier cell.
kmedoids_method = 'pam'
metric = 'ggower'
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'
clustering_method = KMedoids(
    n_clusters=n_clusters, 
    metric='precomputed', 
    method=kmedoids_method, 
    init='build', 
    max_iter=100, 
    random_state=123
)
sample_dist_clust = SampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    frac_sample_size=0.1,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05
)
# The timer starts after construction, so only fit() and the label lookup are timed.
start_time = time.time()
sample_dist_clust.fit(X)
predicted_clusters_sample_dist_clust = sample_dist_clust.labels_
# Elapsed wall-clock seconds for this method.
time_fast_kmedoids = time.time() - start_time

###############################################################################

# Method 2: FoldSampleDistClustering, reusing the same base clusterer, metric
# and distance settings as the previous method.
fold_sample_dist_clust = FoldSampleDistClustering(
    clustering_method = clustering_method,
    metric = metric,
    n_splits=5, 
    shuffle=True,
    frac_sample_size=0.1,
    meta_frac_sample_size=0.8,
    random_state=123,
    stratify=False,
    p1=p1, p2=p2, p3=p3,
    d1=d1, d2=d2, d3=d3, 
    robust_method='trimmed', alpha=0.05,
)
# The timer starts after construction, so only fit() and the label lookup are timed.
start_time = time.time()
fold_sample_dist_clust.fit(X=X) 
predicted_clusters_fold_sample_dist_clust = fold_sample_dist_clust.labels_
# Elapsed wall-clock seconds for this method.
time_fold_fast_kmedoids = time.time() - start_time

###############################################################################

# Baseline 1: KMeans fitted directly on X.
# For the three baselines below the timer starts before the estimator is
# constructed, so construction time is included in the measurement.
start_time = time.time()
kmeans = KMeans(n_clusters=n_clusters, random_state=123, init='k-means++', n_init='auto', max_iter=300)
kmeans.fit(X)
predicted_clusters_kmeans = kmeans.labels_
time_kmeans = time.time() - start_time

###############################################################################

# Baseline 2: Gaussian mixture; labels come from predict() on the same data.
start_time = time.time()
gmm = GaussianMixture(n_components=n_clusters, random_state=123)
gmm.fit(X)
predicted_clusters_gmm = gmm.predict(X)
time_gmm = time.time() - start_time

###############################################################################

# Baseline 3: KMedoids (PAM) with the Euclidean metric, fitted directly on X;
# labels come from predict() on the same data.
start_time = time.time()
kmedoids = KMedoids(n_clusters=n_clusters, metric='euclidean', method='pam', init='build', max_iter=100, random_state=123)
kmedoids.fit(X)
predicted_clusters_kmedoids = kmedoids.predict(X)
time_kmedoids = time.time() - start_time

###############################################################################

# Score every method against the true response with balanced accuracy.
# adjusted_score returns a (score, labels) pair; presumably the returned labels
# are the predicted clusters relabelled to best match y_true — confirm in the
# db_robust_clust.metrics docs.
adj_accuracy_sample_dist_clust, adj_predicted_clusters_sample_dist_clust = adjusted_score(y_pred=predicted_clusters_sample_dist_clust, y_true=y, metric=balanced_accuracy_score)
adj_accuracy_fold_sample_dist_clust, adj_predicted_clusters_fold_sample_dist_clust = adjusted_score(y_pred=predicted_clusters_fold_sample_dist_clust, y_true=y, metric=balanced_accuracy_score)
adj_accuracy_kmeans, adj_predicted_clusters_kmeans = adjusted_score(y_pred=predicted_clusters_kmeans, y_true=y, metric=balanced_accuracy_score)
adj_accuracy_gmm, adj_predicted_clusters_gmm = adjusted_score(y_pred=predicted_clusters_gmm, y_true=y, metric=balanced_accuracy_score)
adj_accuracy_kmedoids, adj_predicted_clusters_kmedoids = adjusted_score(y_pred=predicted_clusters_kmedoids, y_true=y, metric=balanced_accuracy_score)

# Adjusted labels per method, restricted to the rows in sample_idx (the same
# rows the MDS coordinates were computed for).
y_pred_dict = {
    'SampleDistClust-RobustGGower-KMedoidsPAM': adj_predicted_clusters_sample_dist_clust[sample_idx],
    'FoldSampleDistClust-RobustGGower-KMedoidsPAM': adj_predicted_clusters_fold_sample_dist_clust[sample_idx],
    'Kmeans': adj_predicted_clusters_kmeans[sample_idx],
    'GMM': adj_predicted_clusters_gmm[sample_idx],
    'Kmedoids': adj_predicted_clusters_kmedoids[sample_idx]
}

# Score per method, computed above on the full data set (not only the sample).
accuracy_dict = {
    'SampleDistClust-RobustGGower-KMedoidsPAM': adj_accuracy_sample_dist_clust,
    'FoldSampleDistClust-RobustGGower-KMedoidsPAM': adj_accuracy_fold_sample_dist_clust,
    'Kmeans': adj_accuracy_kmeans,
    'GMM': adj_accuracy_gmm,
    'Kmedoids': adj_accuracy_kmedoids,
}

# Elapsed wall-clock seconds per method; the keys match the two dictionaries above.
time_dict = {
    'SampleDistClust-RobustGGower-KMedoidsPAM': time_fast_kmedoids,
    'FoldSampleDistClust-RobustGGower-KMedoidsPAM': time_fold_fast_kmedoids,
    'Kmeans': time_kmeans,
    'GMM': time_gmm,
    'Kmedoids': time_kmedoids,
}

# Plot all methods on the shared MDS coordinates. The true labels are indexed
# with sample_idx so they stay aligned with X_mds and the per-method labels.
# The figure is not saved.
clustering_MDS_plot_multiple_methods(X_mds=X_mds, y_pred=y_pred_dict, 
                                     y_true=y[sample_idx],
                                     title="MDS visualization of clustering results", 
                                     accuracy=accuracy_dict, time=time_dict, n_rows=2,
                                     figsize=(15,10), bbox_to_anchor=(0.68,-1.9), 
                                     title_size=13, subtitles_size=10,
                                     title_weight='bold', points_size=45, 
                                     title_height=0.98, legend_size=8, 
                                     wspace=0.25, hspace=0.45, 
                                     legend_title='Cluster Labels',
                                     n_cols_legend=4, save=False)

c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)
c:\Users\fscielzo\Documents\Proyectos\db_robust_clust-package\.venv\Lib\site-packages\seaborn\_core\data.py:313: UserWarning: Conversion using Arrow PyCapsule Interface failed due to missing PyArrow>=14 dependency, falling back to (deprecated) interchange protocol. We recommend that you install PyArrow>=14.0.0.
  return pd.api.interchange.from_dataframe(data)

_images/540950c13cb5d3acf7182e9adb46fadca85e7640598e3ff46aceec79280b6498.png

Quick Start

Contents

Quick Start#

Imports and Paths#

Installation#

Data#

db_robust_clust.models#

SampleDistClustering#

KMedoids - pam#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - fasterpam#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - alternate#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - fastermsc#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - pamsil#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - pammedsil#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

FoldSampleDistClustering#

KMedoids - pam#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - fasterpam#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - alternate#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - fastermsc#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - pamsil#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

KMedoids - pammedsil#

metric = 'euclidean'#

metric = 'ggower'#

metric = 'RelMS'#

db_robust_clust.plots#

clustering_MDS_plot_one_method#

clustering_MDS_plot_multiple_methods#

`db_robust_clust.models`#

`SampleDistClustering`#

`KMedoids - pam`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - fasterpam`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - alternate`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - fastermsc`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - pamsil`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - pammedsil`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`FoldSampleDistClustering`#

`KMedoids - pam`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - fasterpam`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - alternate`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - fastermsc`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - pamsil`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`KMedoids - pammedsil`#

`metric = 'euclidean'`#

`metric = 'ggower'`#

`metric = 'RelMS'`#

`db_robust_clust.plots`#

`clustering_MDS_plot_one_method`#

`clustering_MDS_plot_multiple_methods`#