Quick Start#

Imports#

import polars as pl
import numpy as np

Installation#

# pip install robust-mixed-dist

To see the available versions of the package, go to the release history on PyPI: https://pypi.org/project/robust_mixed_dist/#history

Data#

Variable

Description

Variable Type

Possible Categories / Range

latitude

Latitude of the house

Quantitative

24.86 - 25.27

longitude

Longitude of the house

Quantitative

55.06 - 55.44

price

Market price of the house

Quantitative

220000 - 35000000

price_per_sqft

Price per square foot

Quantitative

361.87 - 4805.87

size_in_sqft

Size in square feet

Quantitative

294 - 9576

no_of_bedrooms

Number of bedrooms in the house

Multiclass

0, 1, 2, 3, 4, 5

no_of_bathrooms

Number of bathrooms in the house

Multiclass

0, 1, 2, 3, 4, 5, 6

quality

Quality level of the house (response variable)

Binary

Low (0), Medium-High-UltraHigh (1)

balcony

Indicates if the house has a balcony

Binary

true (1), false (0)

barbecue_area

Indicates if the house has a barbecue area

Binary

true (1), false (0)

private_pool

Indicates if the house has a private pool

Binary

true (1), false (0)

private_garden

Indicates if the house has a private garden

Binary

true (1), false (0)

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/robust_mixed_dist-docu/refs/heads/main/data/dubai_houses_processed.csv"
df = pl.read_csv(data_url)

quant_variables = ['latitude', 'longitude', 'price', 'size_in_sqft', 'price_per_sqft']
binary_variables = ['quality', 'balcony', 'barbecue_area', 'private_pool', 'private_garden']
multiclass_variables = ['no_of_bedrooms', 'no_of_bathrooms']

X = df[quant_variables + binary_variables + multiclass_variables]

p1 = len(quant_variables)
p2 = len(binary_variables)
p3 = len(multiclass_variables)
X.head()
shape: (5, 12)
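latitude  | longitude | price   | size_in_sqft | price_per_sqft | quality | balcony | barbecue_area | private_pool | private_garden | no_of_bedrooms | no_of_bathrooms
f64       | f64       | i64     | i64          | f64            | i64     | i64     | i64           | i64          | i64            | i64            | i64
25.113208 | 55.138932 | 2700000 | 1079         | 2502.32        | 1       | 1       | 1             | 0            | 0              | 1              | 2
25.106809 | 55.151201 | 2850000 | 1582         | 1801.52        | 1       | 1       | 0             | 0            | 0              | 2              | 2
25.063302 | 55.137728 | 1150000 | 1951         | 589.44         | 1       | 1       | 0             | 0            | 0              | 3              | 5
25.227295 | 55.341761 | 2850000 | 2020         | 1410.89        | 0       | 1       | 0             | 0            | 0              | 2              | 3
25.114275 | 55.139764 | 1729200 | 507          | 3410.65        | 1       | 0       | 0             | 0            | 0              | 0              | 1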
n = len(X)
raw_weights = np.random.rand(n)
simulated_weights = raw_weights / np.sum(raw_weights)

robust_mixed_dist.mixed#

generalized_gower_dist_matrix#

from robust_mixed_dist.mixed import generalized_gower_dist_matrix
d1 = 'euclidean'
d2 = 'sokal'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        weights=None
    )

D
array([[0.        , 1.41503568, 2.14046114, ..., 2.07437985, 2.02688788,
        0.66580485],
       [1.41503568, 0.        , 1.84542512, ..., 1.75076502, 1.59918112,
        1.5856242 ],
       [2.14046114, 1.84542512, 0.        , ..., 0.60087177, 2.07964796,
        2.07754328],
       ...,
       [2.07437985, 1.75076502, 0.60087177, ..., 0.        , 2.20952014,
        2.1994978 ],
       [2.02688788, 1.59918112, 2.07964796, ..., 2.20952014, 0.        ,
        1.9041296 ],
       [0.66580485, 1.5856242 , 2.07754328, ..., 2.1994978 , 1.9041296 ,
        0.        ]], shape=(1905, 1905))
d1 = 'euclidean'
d2 = 'sokal'
d3 = 'hamming'

weights = simulated_weights

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        weights=weights
    )

D
array([[0.        , 1.40622841, 2.13589776, ..., 2.06477834, 2.02102281,
        0.68955799],
       [1.40622841, 0.        , 1.84669396, ..., 1.74503306, 1.60400845,
        1.58950663],
       [2.13589776, 1.84669396, 0.        , ..., 0.62230837, 2.07045472,
        2.06818707],
       ...,
       [2.06477834, 1.74503306, 0.62230837, ..., 0.        , 2.21004103,
        2.19929159],
       [2.02102281, 1.60400845, 2.07045472, ..., 2.21004103, 0.        ,
        1.88862973],
       [0.68955799, 1.58950663, 2.06818707, ..., 2.19929159, 1.88862973,
        0.        ]], shape=(1905, 1905))
d1 = 'minkowski'
d2 = 'sokal'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        q=1
    )

D
array([[0.        , 1.41504995, 2.14064286, ..., 2.07439996, 2.02701417,
        0.66615139],
       [1.41504995, 0.        , 1.8455187 , ..., 1.75076714, 1.5993112 ,
        1.58574048],
       [2.14064286, 1.8455187 , 0.        , ..., 0.60093846, 2.07968384,
        2.07757304],
       ...,
       [2.07439996, 1.75076714, 0.60093846, ..., 0.        , 2.20961978,
        2.19958686],
       [2.02701417, 1.5993112 , 2.07968384, ..., 2.20961978, 0.        ,
        1.90412993],
       [0.66615139, 1.58574048, 2.07757304, ..., 2.19958686, 1.90412993,
        0.        ]], shape=(1905, 1905))
d1 = 'canberra'
d2 = 'sokal'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3
    )

D
array([[0.        , 1.51698365, 2.81057962, ..., 2.23259929, 2.63105196,
        1.67845893],
       [1.51698365, 0.        , 2.30710265, ..., 1.77496107, 2.36500717,
        2.25919371],
       [2.81057962, 2.30710265, 0.        , ..., 1.27280892, 2.46948878,
        2.41664979],
       ...,
       [2.23259929, 1.77496107, 1.27280892, ..., 0.        , 2.81661747,
        2.73099007],
       [2.63105196, 2.36500717, 2.46948878, ..., 2.81661747, 0.        ,
        1.91188048],
       [1.67845893, 2.25919371, 2.41664979, ..., 2.73099007, 1.91188048,
        0.        ]], shape=(1905, 1905))
d1 = 'pearson'
d2 = 'sokal'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
    )

D
array([[0.        , 1.51402981, 2.51238026, ..., 2.6775671 , 2.30460266,
        1.12050558],
       [1.51402981, 0.        , 1.98226696, ..., 2.28716057, 1.7206908 ,
        1.61863728],
       [2.51238026, 1.98226696, 0.        , ..., 1.81898487, 2.25477183,
        2.17232953],
       ...,
       [2.6775671 , 2.28716057, 1.81898487, ..., 0.        , 2.52705657,
        2.68755742],
       [2.30460266, 1.7206908 , 2.25477183, ..., 2.52705657, 0.        ,
        1.96638818],
       [1.12050558, 1.61863728, 2.17232953, ..., 2.68755742, 1.96638818,
        0.        ]], shape=(1905, 1905))
d1 = 'mahalanobis'
d2 = 'sokal'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
    )

D
array([[0.        , 1.66836681, 2.9344599 , ..., 2.43712289, 2.6009022 ,
        1.71120297],
       [1.66836681, 0.        , 2.13804234, ..., 1.80120445, 1.81351956,
        1.76764702],
       [2.9344599 , 2.13804234, 0.        , ..., 0.84835547, 2.40741939,
        2.38131804],
       ...,
       [2.43712289, 1.80120445, 0.84835547, ..., 0.        , 2.36869851,
        2.33260135],
       [2.6009022 , 1.81351956, 2.40741939, ..., 2.36869851, 0.        ,
        1.90552695],
       [1.71120297, 1.76764702, 2.38131804, ..., 2.33260135, 1.90552695,
        0.        ]], shape=(1905, 1905))
d1 = 'robust_mahalanobis'
d2 = 'sokal'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20
    )

D
array([[0.        , 1.64946389, 2.79008972, ..., 2.36223596, 2.87529178,
        2.06208371],
       [1.64946389, 0.        , 2.04187428, ..., 1.78060429, 2.02786949,
        1.95803076],
       [2.79008972, 2.04187428, 0.        , ..., 0.77870059, 2.3599971 ,
        2.32341031],
       ...,
       [2.36223596, 1.78060429, 0.77870059, ..., 0.        , 2.50576595,
        2.45118981],
       [2.87529178, 2.02786949, 2.3599971 , ..., 2.50576595, 0.        ,
        1.90657364],
       [2.06208371, 1.95803076, 2.32341031, ..., 2.45118981, 1.90657364,
        0.        ]], shape=(1905, 1905))
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20
    )

D
# Time: 0.2 secs
array([[0.        , 1.41733191, 2.6594547 , ..., 2.20641278, 2.41162867,
        2.06208371],
       [1.41733191, 0.        , 2.04187428, ..., 1.78060429, 1.84400469,
        1.76691343],
       [2.6594547 , 2.04187428, 0.        , ..., 0.77870059, 2.20401564,
        2.16479423],
       ...,
       [2.20641278, 1.78060429, 0.77870059, ..., 0.        , 2.35944095,
        2.30139742],
       [2.41162867, 1.84400469, 2.20401564, ..., 2.35944095, 0.        ,
        1.08796741],
       [2.06208371, 1.76691343, 2.16479423, ..., 2.30139742, 1.08796741,
        0.        ]], shape=(1905, 1905))
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

weights = simulated_weights

D = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, weights=weights
    )

D
array([[0.        , 1.22346879, 2.20797324, ..., 2.01505549, 1.22110179,
        0.58290575],
       [1.22346879, 0.        , 1.92665926, ..., 1.76188422, 1.2039458 ,
        1.20253553],
       [2.20797324, 1.92665926, 0.        , ..., 0.64911354, 1.97428359,
        1.9726795 ],
       ...,
       [2.01505549, 1.76188422, 0.64911354, ..., 0.        , 1.94068   ,
        1.93898147],
       [1.22110179, 1.2039458 , 1.97428359, ..., 1.94068   , 0.        ,
        1.07482551],
       [0.58290575, 1.20253553, 1.9726795 , ..., 1.93898147, 1.07482551,
        0.        ]], shape=(1905, 1905))
D, D1, D2, D3 = generalized_gower_dist_matrix(
        X=X, p1=p1, p2=p2, p3=p3,
        d1=d1, d2=d2, d3=d3, 
        robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, weights=weights,
        return_combined_distances=True
    )
D1
array([[  0.        ,  73.42302734, 178.72422328, ..., 110.36938173,
         90.26428032,  90.7829134 ],
       [ 73.42302734,   0.        , 127.21288122, ...,  37.96138528,
         65.12649349,  64.49140866],
       [178.72422328, 127.21288122,   0.        , ..., 101.0942474 ,
         90.7532933 ,  89.90323136],
       ...,
       [110.36938173,  37.96138528, 101.0942474 , ...,   0.        ,
         71.0293845 ,  69.89518059],
       [ 90.26428032,  65.12649349,  90.7532933 , ...,  71.0293845 ,
          0.        ,   1.28659351],
       [ 90.7829134 ,  64.49140866,  89.90323136, ...,  69.89518059,
          1.28659351,   0.        ]], shape=(1905, 1905))
D2
array([[0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.5       ,
        0.        ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       ...,
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.5       , 0.33333333, 0.33333333, ..., 0.33333333, 0.        ,
        0.5       ],
       [0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.5       ,
        0.        ]], shape=(1905, 1905))
D3
array([[0. , 0.5, 1. , ..., 1. , 0. , 0. ],
       [0.5, 0. , 1. , ..., 1. , 0.5, 0.5],
       [1. , 1. , 0. , ..., 0. , 1. , 1. ],
       ...,
       [1. , 1. , 0. , ..., 0. , 1. , 1. ],
       [0. , 0.5, 1. , ..., 1. , 0. , 0. ],
       [0. , 0.5, 1. , ..., 1. , 0. , 0. ]], shape=(1905, 1905))

generalized_gower_dist#

from robust_mixed_dist.mixed import generalized_gower_dist, compute_geometric_var
from robust_mixed_dist.quantitative import S_robust
xi = X[0,:]
xr = X[2,:]
d1 = 'euclidean'
d2 = 'sokal'
d3 = 'hamming'

gv1, gv2, gv3 = compute_geometric_var(
        X=X, 
        p1=p1, p2=p2, p3=p3, 
        d1=d1, d2=d2, d3=d3, 
    )

d = generalized_gower_dist(
    xi=xi, xr=xr, 
    p1=p1, p2=p2, p3=p3, 
    d1=d1, d2=d2, d3=d3,  
    q=1, S=None, 
    geom_var_1=gv1, 
    geom_var_2=gv2, 
    geom_var_3=gv3
)

d
np.float64(2.1404611423448294)
d1 = 'euclidean'
d2 = 'sokal'
d3 = 'hamming'

weights = simulated_weights

gv1, gv2, gv3 = compute_geometric_var(
        X=X, 
        p1=p1, p2=p2, p3=p3, 
        d1=d1, d2=d2, d3=d3, 
        weights=weights
    )

d = generalized_gower_dist(
    xi=xi, xr=xr, 
    p1=p1, p2=p2, p3=p3, 
    d1=d1, d2=d2, d3=d3,  
    q=1, S=None, 
    geom_var_1=gv1, 
    geom_var_2=gv2, 
    geom_var_3=gv3
)

d
np.float64(2.135897759374307)
d1 = 'mahalanobis'
d2 = 'sokal'
d3 = 'hamming'

gv1, gv2, gv3 = compute_geometric_var(
        X=X, 
        p1=p1, p2=p2, p3=p3, 
        d1=d1, d2=d2, d3=d3
        )

X_quant = X[:, :p1]
S_estimation = np.cov(X_quant, rowvar=False)

d = generalized_gower_dist(
    xi=xi, xr=xr, 
    p1=p1, p2=p2, p3=p3, 
    d1=d1, d2=d2, d3=d3,  
    S=S_estimation, 
    geom_var_1=gv1, 
    geom_var_2=gv2, 
    geom_var_3=gv3
)

d
np.float64(2.9588226438900027)
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

gv1, gv2, gv3 = compute_geometric_var(
        X=X, 
        p1=p1, p2=p2, p3=p3, 
        d1=d1, d2=d2, d3=d3, 
        robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20
    )

X_quant = X[:, :p1]
S_estimation = S_robust(X=X_quant, method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20)

d = generalized_gower_dist(
    xi=xi, xr=xr, 
    p1=p1, p2=p2, p3=p3, 
    d1=d1, d2=d2, d3=d3,  
    S=S_estimation, 
    geom_var_1=gv1, 
    geom_var_2=gv2, 
    geom_var_3=gv3
)

d
np.float64(2.6594546971734747)
d1 = 'robust_mahalanobis'
d2 = 'jaccard'
d3 = 'hamming'

weights = simulated_weights

gv1, gv2, gv3 = compute_geometric_var(
        X=X, 
        p1=p1, p2=p2, p3=p3, 
        d1=d1, d2=d2, d3=d3, 
        robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20, weights=weights
    )

X_quant = X[:, :p1]
S_estimation = S_robust(X=X_quant, method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20, weights=weights)

d = generalized_gower_dist(
    xi=xi, xr=xr, 
    p1=p1, p2=p2, p3=p3, 
    d1=d1, d2=d2, d3=d3,  
    q=1, S=S_estimation, 
    geom_var_1=gv1, 
    geom_var_2=gv2, 
    geom_var_3=gv3
)

d
np.float64(2.207973237508286)

robust_mixed_dist.quantitative#

euclidean_dist_matrix#

from robust_mixed_dist.quantitative import euclidean_dist_matrix
euclidean_dist_matrix(X=X[quant_variables])
array([[      0.        ,  150002.48041163, 1550001.42564254, ...,
         200004.30910639, 2025000.65272331, 1939113.64052303],
       [ 150002.48041163,       0.        , 1700000.47214668, ...,
          50002.10458763, 2175000.34481037, 2089113.31944683],
       [1550001.42564254, 1700000.47214668,       0.        , ...,
        1750000.23836684,  475001.65333313,  389114.87041128],
       ...,
       [ 200004.30910639,   50002.10458763, 1750000.23836684, ...,
              0.        , 2225000.39656347, 2139113.36955273],
       [2025000.65272331, 2175000.34481037,  475001.65333313, ...,
        2225000.39656347,       0.        ,   85887.02978977],
       [1939113.64052303, 2089113.31944683,  389114.87041128, ...,
        2139113.36955273,   85887.02978977,       0.        ]],
      shape=(1905, 1905))

euclidean_dist#

from robust_mixed_dist.quantitative import euclidean_dist
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

euclidean_dist(xi=xi, xr=xr)
2400000.3317385474

minkowski_dist_matrix#

from robust_mixed_dist.quantitative import minkowski_dist_matrix
minkowski_dist_matrix(X=X[quant_variables], q=1)
array([[      0.      ,  151203.818668, 1552784.93111 , ...,
         201851.029416, 2026929.290262, 1940943.259859],
       [ 151203.818668,       0.      , 1701581.13698 , ...,
          50647.223546, 2176731.471594, 2090745.441191],
       [1552784.93111 , 1701581.13698 ,       0.      , ...,
        1750934.500526,  476533.822672,  390625.702813],
       ...,
       [ 201851.029416,   50647.223546, 1750934.500526, ...,
              0.      , 2226780.677854, 2140794.797713],
       [2026929.290262, 2176731.471594,  476533.822672, ...,
        2226780.677854,       0.      ,   85986.030403],
       [1940943.259859, 2090745.441191,  390625.702813, ...,
        2140794.797713,   85986.030403,       0.      ]],
      shape=(1905, 1905))

minkowski_dist#

from robust_mixed_dist.quantitative import minkowski_dist
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

minkowski_dist(xi=xi, xr=xr, q=1)
np.float64(2401294.7191080004)

canberra_dist_matrix#

from robust_mixed_dist.quantitative import canberra_dist_matrix
canberra_dist_matrix(X=X[quant_variables])
array([[0.        , 0.3791237 , 1.31009432, ..., 0.57187821, 1.25368465,
        1.15877842],
       [0.3791237 , 0.        , 1.03737598, ..., 0.20199247, 1.30874091,
        1.21638296],
       [1.31009432, 1.03737598, 0.        , ..., 0.87872481, 0.92625515,
        0.85725929],
       ...,
       [0.57187821, 0.20199247, 0.87872481, ..., 0.        , 1.31626027,
        1.22724486],
       [1.25368465, 1.30874091, 0.92625515, ..., 1.31626027, 0.        ,
        0.12046549],
       [1.15877842, 1.21638296, 0.85725929, ..., 1.22724486, 0.12046549,
        0.        ]], shape=(1905, 1905))

canberra_dist#

from robust_mixed_dist.quantitative import canberra_dist
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

canberra_dist(xi=xi, xr=xr)
np.float64(1.0399913488310089)

pearson_dist_matrix#

from robust_mixed_dist.quantitative import pearson_dist_matrix
pearson_dist_matrix(X=X[quant_variables])
array([[0.        , 1.20925465, 3.17227178, ..., 3.78782432, 2.9029884 ,
        2.50486894],
       [1.20925465, 0.        , 2.0785152 , ..., 3.2901998 , 2.19157364,
        1.76066423],
       [3.17227178, 2.0785152 , 0.        , ..., 4.06630612, 1.98147384,
        1.44992911],
       ...,
       [3.78782432, 3.2901998 , 4.06630612, ..., 0.        , 3.23000242,
        3.82301831],
       [2.9029884 , 2.19157364, 1.98147384, ..., 3.23000242, 0.        ,
        1.09935355],
       [2.50486894, 1.76066423, 1.44992911, ..., 3.82301831, 1.09935355,
        0.        ]], shape=(1905, 1905))

mahalanobis_dist_matrix#

from robust_mixed_dist.quantitative import mahalanobis_dist_matrix
mahalanobis_dist_matrix(X=X[quant_variables])
array([[0.        , 1.53300007, 3.59602077, ..., 2.2183579 , 3.06832405,
        2.96311246],
       [1.53300007, 0.        , 2.1252418 , ..., 0.73353908, 1.96608988,
        1.83655561],
       [3.59602077, 2.1252418 , 0.        , ..., 1.46900905, 2.11887678,
        2.0285113 ],
       ...,
       [2.2183579 , 0.73353908, 1.46900905, ..., 0.        , 1.98371456,
        1.85102134],
       [3.06832405, 1.96608988, 2.11887678, ..., 1.98371456, 0.        ,
        0.13626976],
       [2.96311246, 1.83655561, 2.0285113 , ..., 1.85102134, 0.13626976,
        0.        ]], shape=(1905, 1905))

mahalanobis_dist#

from robust_mixed_dist.quantitative import mahalanobis_dist
import numpy as np
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

S = np.cov(X[quant_variables], rowvar=False)

mahalanobis_dist(xi=xi, xr=xr, S=S)
np.float64(2.7759092403090455)

robust_mahalanobis_dist_matrix#

from robust_mixed_dist.quantitative import robust_mahalanobis_dist_matrix, S_robust
S_robust_trimmed = S_robust(X=X[quant_variables], method="trimmed",
                               epsilon=0.05, n_iters=20, alpha=0.07, 
                               weights=None)

robust_mahalanobis_dist_matrix(X=X[quant_variables],   
                        S_robust=S_robust_trimmed)
array([[0.        , 1.81864545, 3.91631623, ..., 2.35363057, 4.98194325,
        4.75458207],
       [1.81864545, 0.        , 2.16959868, ..., 0.6471128 , 3.43994713,
        3.20084918],
       [3.91631623, 2.16959868, 0.        , ..., 1.64792972, 2.60025908,
        2.39706372],
       ...,
       [2.35363057, 0.6471128 , 1.64792972, ..., 0.        , 3.35699379,
        3.11692523],
       [4.98194325, 3.43994713, 2.60025908, ..., 3.35699379, 0.        ,
        0.24221216],
       [4.75458207, 3.20084918, 2.39706372, ..., 3.11692523, 0.24221216,
        0.        ]], shape=(1905, 1905))
S_robust_winsorized = S_robust(X=X[quant_variables],  
                               method="winsorized", epsilon=0.05, 
                               n_iters=20, alpha=0.07, weights=None)


robust_mahalanobis_dist_matrix(X=X[quant_variables],                          
                        S_robust=S_robust_winsorized)
array([[0.        , 1.63032279, 3.65293571, ..., 2.22371162, 3.90611218,
        3.74688562],
       [1.63032279, 0.        , 2.08532543, ..., 0.66249147, 2.59842998,
        2.42433088],
       [3.65293571, 2.08532543, 0.        , ..., 1.52111172, 2.10402449,
        1.9736514 ],
       ...,
       [2.22371162, 0.66249147, 1.52111172, ..., 0.        , 2.51448355,
        2.33923374],
       [3.90611218, 2.59842998, 2.10402449, ..., 2.51448355, 0.        ,
        0.17812079],
       [3.74688562, 2.42433088, 1.9736514 , ..., 2.33923374, 0.17812079,
        0.        ]], shape=(1905, 1905))
S_robust_MAD = S_robust(X=X[quant_variables], method="MAD", epsilon=0.05, 
                               n_iters=20, alpha=None, weights=None)

robust_mahalanobis_dist_matrix(X=X[quant_variables], S_robust=S_robust_MAD)
array([[0.        , 1.86757181, 3.95871434, ..., 2.36822465, 5.39713175,
        5.13977515],
       [1.86757181, 0.        , 2.17155508, ..., 0.64240264, 3.80101401,
        3.5324399 ],
       [3.95871434, 2.17155508, 0.        , ..., 1.67434971, 2.92445535,
        2.68698538],
       ...,
       [2.36822465, 0.64240264, 1.67434971, ..., 0.        , 3.75496147,
        3.48534267],
       [5.39713175, 3.80101401, 2.92445535, ..., 3.75496147, 0.        ,
        0.27150497],
       [5.13977515, 3.5324399 , 2.68698538, ..., 3.48534267, 0.27150497,
        0.        ]], shape=(1905, 1905))

robust_mahalanobis_dist#

from robust_mixed_dist.quantitative import robust_mahalanobis_dist, S_robust
xi = X[quant_variables][0,:]
xr = X[quant_variables][2,:]
S_robust_trimmed = S_robust(X=X[quant_variables], method="trimmed", 
                               epsilon=0.05, n_iters=20, alpha=0.07, 
                               weights=None)

robust_mahalanobis_dist(xi=xi, xr=xr, S_robust=S_robust_trimmed)
np.float64(3.9163162282135637)
S_robust_winsorized = S_robust(X=X[quant_variables],                   
                               method="winsorized", epsilon=0.05, 
                               n_iters=20, alpha=0.07, weights=None)

robust_mahalanobis_dist(xi=xi, xr=xr, S_robust=S_robust_winsorized)
np.float64(3.6529357075358972)
S_robust_MAD = S_robust(X=X[quant_variables], method="MAD",
                               epsilon=0.05, n_iters=20, alpha=None,
                               weights=None)

robust_mahalanobis_dist(xi=xi, xr=xr, S_robust=S_robust_MAD)
np.float64(3.9587143448190836)

robust_mixed_dist.binary#

sokal_dist_matrix#

from robust_mixed_dist.binary import sokal_dist_matrix
sokal_dist_matrix(X=X[binary_variables])
array([[0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.57142857,
        0.        ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       ...,
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.57142857, 0.33333333, 0.33333333, ..., 0.33333333, 0.        ,
        0.57142857],
       [0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.57142857,
        0.        ]], shape=(1905, 1905))

sokal_dist#

from robust_mixed_dist.binary import sokal_dist
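xi = X[binary_variables][2,:]
xr = X[binary_variables][10,:]

sokal_dist(xi=xi, xr=xr)
# NOTE: re-run this cell to refresh the output. The value previously shown here (2.000000000001225) was computed on the quantitative columns, which the Sokal distance is not defined for.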

jaccard_dist_matrix#

from robust_mixed_dist.binary import jaccard_dist_matrix
jaccard_dist_matrix(X=X[binary_variables])
array([[0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.5       ,
        0.        ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       ...,
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.5       , 0.33333333, 0.33333333, ..., 0.33333333, 0.        ,
        0.5       ],
       [0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.5       ,
        0.        ]], shape=(1905, 1905))

jaccard_dist#

from robust_mixed_dist.binary import jaccard_dist
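xi = X[binary_variables][2,:]
xr = X[binary_variables][10,:]

jaccard_dist(xi=xi, xr=xr)
# NOTE: re-run this cell to refresh the output. The value previously shown here (np.float64(0.0)) was computed on the quantitative columns, which the Jaccard distance is not defined for.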

robust_mixed_dist.multiclass#

hamming_dist_matrix#

from robust_mixed_dist.multiclass import hamming_dist_matrix
hamming_dist_matrix(X=X[multiclass_variables])
array([[0. , 0.5, 1. , ..., 1. , 0. , 0. ],
       [0.5, 0. , 1. , ..., 1. , 0.5, 0.5],
       [1. , 1. , 0. , ..., 0. , 1. , 1. ],
       ...,
       [1. , 1. , 0. , ..., 0. , 1. , 1. ],
       [0. , 0.5, 1. , ..., 1. , 0. , 0. ],
       [0. , 0.5, 1. , ..., 1. , 0. , 0. ]], shape=(1905, 1905))

hamming_dist#

from robust_mixed_dist.multiclass import hamming_dist
xi = X[multiclass_variables][0,:]
xr = X[multiclass_variables][2,:]

hamming_dist(xi=xi, xr=xr)
np.float64(1.0)