quantitative#

euclidean_dist_matrix#

Calculates the Euclidean distance matrix for a data matrix using SciPy.

Parameters (inputs)
----------
X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

Returns (outputs)
-------
M: the Euclidean distance matrix between the rows of `X`.

Example#

import pandas as pd
from PyDistances.quantitative import euclidean_dist_matrix

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']
 
euclidean_dist_matrix(X=data[quant_cols])
array([[     0.        ,  44900.00041203,  59247.00760376, ...,
        595000.01035798, 610000.04150574, 339000.00009587],
       [ 44900.00041203,      0.        ,  14347.02014357, ...,
        550100.01002272, 565100.04244381, 294100.0000102 ],
       [ 59247.00760376,  14347.02014357,      0.        , ...,
        535753.00612689, 550753.03452909, 279753.00086505],
       ...,
       [595000.01035798, 550100.01002272, 535753.00612689, ...,
             0.        ,  15000.43336041, 256000.02072851],
       [610000.04150574, 565100.04244381, 550753.03452909, ...,
         15000.43336041,      0.        , 271000.08689667],
       [339000.00009587, 294100.0000102 , 279753.00086505, ...,
        256000.02072851, 271000.08689667,      0.        ]])

euclidean_dist#

Calculates the Euclidean distance between a pair of vectors.

Parameters (inputs)
----------
xi, xr: a pair of Pandas or Polars Series or DataFrames, or NumPy arrays. They represent a couple of statistical observations of quantitative variables.

Returns (outputs)
-------
The Euclidean distance between the observations `xi` and `xr`.

Example#

import pandas as pd
from PyDistances.quantitative import euclidean_dist

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 
              'n_floors', 'buy_price']

xi = data[quant_cols].iloc[2,:]
xr = data[quant_cols].iloc[10,:]

euclidean_dist(xi=xi, xr=xr)
26247.011906119904

minkowski_dist_matrix#

Calculates the Minkowski distance matrix for a data matrix using SciPy.

Parameters (inputs)
----------
X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.
q: the parameter that defines the Minkowski form. Some particular cases: q=1 := Manhattan, q=2 := Euclidean.

Returns (outputs)
-------
M: the Minkowski(`q`) distance matrix between the rows of `X`.

Example#

import pandas as pd
from PyDistances.quantitative import minkowski_dist_matrix 

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']
   
minkowski_dist_matrix(X=data[quant_cols], q=1)
array([[     0.,  44907.,  59278., ..., 595114., 610231., 339009.],
       [ 44907.,      0.,  14373., ..., 550207., 565324., 294104.],
       [ 59278.,  14373.,      0., ..., 535836., 550953., 279775.],
       ...,
       [595114., 550207., 535836., ...,      0.,  15117., 256105.],
       [610231., 565324., 550953., ...,  15117.,      0., 271222.],
       [339009., 294104., 279775., ..., 256105., 271222.,      0.]])

minkowski_dist#

Calculates the Minkowski distance between a pair of vectors.

Parameters (inputs)
----------
xi, xr: a pair of quantitative vectors. They represent a couple of statistical observations.
q: the parameter that defines the Minkowski form. Some particular cases: q=1 := Manhattan, q=2 := Euclidean.

Returns (outputs)
-------
The Minkowski(`q`) distance between the observations `xi` and `xr`.

Example#

import pandas as pd
from PyDistances.quantitative import minkowski_dist

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

xi = data[quant_cols].iloc[2,:]
xr = data[quant_cols].iloc[10,:]

minkowski_dist(xi=xi, xr=xr, q=1)
26272.0

canberra_dist_matrix#

Calculates the Canberra distance matrix for a data matrix using SciPy.

Parameters (inputs)
----------
X: a pandas/polars DataFrame or a NumPy array. It represents a data matrix.

Returns (outputs)
-------
M: the Canberra distance matrix between the rows of `X`.

Example#

import pandas as pd
from PyDistances.quantitative import canberra_dist_matrix

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

canberra_dist_matrix(X=data[quant_cols])
array([[0.        , 0.45371051, 0.78164852, ..., 1.90887959, 2.75277838,
        1.05816865],
       [0.45371051, 0.        , 0.73200803, ..., 1.58398156, 2.43793773,
        1.07838011],
       [0.78164852, 0.73200803, 0.        , ..., 1.28443942, 2.19871833,
        0.62483892],
       ...,
       [1.90887959, 1.58398156, 1.28443942, ..., 0.        , 0.95659875,
        0.98222144],
       [2.75277838, 2.43793773, 2.19871833, ..., 0.95659875, 0.        ,
        1.87662188],
       [1.05816865, 1.07838011, 0.62483892, ..., 0.98222144, 1.87662188,
        0.        ]])

canberra_dist#

Calculates the Canberra distance between a pair of vectors.

Parameters (inputs)
----------
xi, xr: a pair of quantitative vectors. They represent a couple of statistical observations.

Returns (outputs)
-------
The Canberra distance between the observations `xi` and `xr`.

Example#

import pandas as pd
from PyDistances.quantitative import canberra_dist 

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

xi = data[quant_cols].iloc[2,:]
xr = data[quant_cols].iloc[10,:]

canberra_dist(xi=xi, xr=xr)
0.25345926746669145

pearson_dist_matrix#

Calculates the Pearson distance matrix for a data matrix using SciPy.

Parameters (inputs)
----------
X: a pandas/polars DataFrame or a NumPy array. It represents a data matrix.

Returns (outputs)
-------
M: the Pearson distance matrix between the rows of X.

Example#

import pandas as pd
from PyDistances.quantitative import pearson_dist_matrix  

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

pearson_dist_matrix(X=data[quant_cols])
array([[0.        , 0.66557805, 0.73750837, ..., 1.81426832, 3.76742474,
        0.83385122],
       [0.66557805, 0.        , 0.98231536, ..., 1.35816365, 3.56285077,
        1.04162306],
       [0.73750837, 0.98231536, 0.        , ..., 1.56993872, 3.47873654,
        0.38187408],
       ...,
       [1.81426832, 1.35816365, 1.56993872, ..., 0.        , 2.98830222,
        1.50055625],
       [3.76742474, 3.56285077, 3.47873654, ..., 2.98830222, 0.        ,
        3.4727102 ],
       [0.83385122, 1.04162306, 0.38187408, ..., 1.50055625, 3.4727102 ,
        0.        ]])

mahalanobis_dist_matrix#

Calculates the Mahalanobis distance matrix for a data matrix using SciPy.

Parameters (inputs)
----------
X: a pandas/polars DataFrame or a NumPy array. It represents a data matrix.

Returns (outputs)
-------
M: the Mahalanobis distance matrix between the rows of X.

Example#

import pandas as pd
from PyDistances.quantitative import mahalanobis_dist_matrix

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

mahalanobis_dist_matrix(X=data[quant_cols])
array([[0.        , 0.98305366, 1.35352819, ..., 1.51225082, 2.9059013 ,
        1.39140298],
       [0.98305366, 0.        , 2.03435122, ..., 0.95079695, 2.95281549,
        2.01444275],
       [1.35352819, 2.03435122, 0.        , ..., 2.12708355, 3.26187536,
        0.82635883],
       ...,
       [1.51225082, 0.95079695, 2.12708355, ..., 0.        , 3.28213849,
        2.08421407],
       [2.9059013 , 2.95281549, 3.26187536, ..., 3.28213849, 0.        ,
        3.30618935],
       [1.39140298, 2.01444275, 0.82635883, ..., 2.08421407, 3.30618935,
        0.        ]])

mahalanobis_dist#

Calculates the Mahalanobis distance between a pair of vectors.

Parameters (inputs)
----------
xi, xr: a pair of quantitative vectors. They represent a couple of statistical observations.
S: the covariance matrix of the data matrix to which `xi` and `xr` belong.

Returns (outputs)
-------
The Mahalanobis distance between the observations `xi` and `xr`.

Example#

import numpy as np
import pandas as pd
from PyDistances.quantitative import mahalanobis_dist

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

xi = data[quant_cols].iloc[2,:]
xr = data[quant_cols].iloc[10,:]

S = np.cov(data[quant_cols], rowvar=False)

mahalanobis_dist(xi=xi, xr=xr, S=S)
20524268507.123516

robust_maha_dist_matrix#

Calculates the Robust Mahalanobis distance matrix for a data matrix `X` using SciPy and a robust estimation of the covariance matrix.

Parameters (inputs)
----------
X: a pandas/polars DataFrame or a NumPy array. It represents a data matrix.
S_robust: the robust covariance matrix of `X`.

Returns (outputs)
-------
M: the Robust Mahalanobis distance matrix between the rows of X.

Example#

import pandas as pd
from PyDistances.quantitative import robust_maha_dist_matrix, S_robust  

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']        
S_robust_estimation = S_robust(X=data[quant_cols], method="trimmed",
                               epsilon=0.05, n_iters=20, alpha=0.07, 
                               weights=None)

robust_maha_dist_matrix(X=data[quant_cols],   
                        S_robust=S_robust_estimation)
array([[0.        , 1.07619417, 1.52755549, ..., 1.6039788 , 3.03495824,
        1.64495744],
       [1.07619417, 0.        , 2.26168841, ..., 0.93299896, 3.15055951,
        2.09739707],
       [1.52755549, 2.26168841, 0.        , ..., 2.39613957, 3.24280591,
        1.28951587],
       ...,
       [1.6039788 , 0.93299896, 2.39613957, ..., 0.        , 3.48370099,
        2.1326725 ],
       [3.03495824, 3.15055951, 3.24280591, ..., 3.48370099, 0.        ,
        3.52339443],
       [1.64495744, 2.09739707, 1.28951587, ..., 2.1326725 , 3.52339443,
        0.        ]])
S_robust_estimation = S_robust(X=data[quant_cols],  
                               method="winsorized", epsilon=0.05, 
                               n_iters=20, alpha=0.07, weights=None)


robust_maha_dist_matrix(X=data[quant_cols],                          
                        S_robust=S_robust_estimation)
array([[0.        , 1.04388227, 1.52580544, ..., 1.58209452, 2.9790249 ,
        1.62043215],
       [1.04388227, 0.        , 2.1919243 , ..., 0.97608111, 3.02378791,
        2.0634736 ],
       [1.52580544, 2.1919243 , 0.        , ..., 2.25328446, 3.36744463,
        1.21972552],
       ...,
       [1.58209452, 0.97608111, 2.25328446, ..., 0.        , 3.40773227,
        2.09940077],
       [2.9790249 , 3.02378791, 3.36744463, ..., 3.40773227, 0.        ,
        3.42871647],
       [1.62043215, 2.0634736 , 1.21972552, ..., 2.09940077, 3.42871647,
        0.        ]])
S_robust_estimation = S_robust(X=data[quant_cols], method="MAD", epsilon=0.05, 
                               n_iters=20, alpha=None, weights=None)

robust_maha_dist_matrix(X=data[quant_cols], S_robust=S_robust_estimation)
array([[0.        , 0.92229336, 0.97058617, ..., 1.78839817, 3.60593355,
        1.04126762],
       [0.92229336, 0.        , 1.02265462, ..., 1.0354775 , 3.3402759 ,
        1.01713595],
       [0.97058617, 1.02265462, 0.        , ..., 1.4612904 , 3.24480033,
        0.62190049],
       ...,
       [1.78839817, 1.0354775 , 1.4612904 , ..., 0.        , 3.09615949,
        1.35288379],
       [3.60593355, 3.3402759 , 3.24480033, ..., 3.09615949, 0.        ,
        3.39834726],
       [1.04126762, 1.01713595, 0.62190049, ..., 1.35288379, 3.39834726,
        0.        ]])

robust_maha_dist#

Calculates the Robust Mahalanobis distance between a pair of vectors.

Parameters (inputs)
----------
xi, xr: a pair of quantitative vectors. They represent a couple of statistical observations.
S_robust: the robust covariance matrix of the data matrix to which `xi` and `xr` belong.

Returns (outputs)
-------
The Robust Mahalanobis distance between the observations `xi` and `xr`.

Example#

import pandas as pd
from PyDistances.quantitative import robust_maha_dist, S_robust 

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

data = pd.read_csv(data_url)

quant_cols = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'n_floors', 'buy_price']

xi = data[quant_cols].iloc[2,:]
xr = data[quant_cols].iloc[10,:]
S_robust_estimation = S_robust(X=data[quant_cols], method="trimmed", 
                               epsilon=0.05, n_iters=20, alpha=0.07, 
                               weights=None)

robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_estimation)
0.45774797743084744
S_robust_estimation = S_robust(X=data[quant_cols],                   
                               method="winsorized", epsilon=0.05, 
                               n_iters=20, alpha=0.07, weights=None)

robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_estimation)
0.45545084258890073
S_robust_estimation = S_robust(X=data[quant_cols], method="MAD",
                               epsilon=0.05, n_iters=20, alpha=None,
                               weights=None)

robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_estimation)
0.2838731015529872