Extracting and analyzing data from idescat.com#

Requirements#

import requests
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
sns.set_style("whitegrid")

Maria’s analysis in Catalonia#

Extracting and processing the data#

years = ['2018', '2019', '2020', '2021', '2022']

# One list per statistic; position k in each list belongs to years[k].
Maria_abs_freq = []
Maria_total_position = []
Maria_girls_position = []
Maria_total_percent = []
Maria_girls_percent = []

for year in years:
    # One request per year for the name Maria (id=40683 in the query string).
    url = f'https://api.idescat.cat/onomastica/v1/nadons/dades.json?id=40683&t={year}&lang=es'
    response = requests.get(url)
    data_json = response.json()

    # Everything read below lives under the same nested node of the response.
    entry = data_json['onomastica_nadons']['ff']['f']
    counts = entry['pos1']
    ranks = entry['rank']

    Maria_abs_freq.append(counts['v'])                # absolute frequency of Maria among children born that year
    Maria_total_position.append(ranks['total'])       # rank of Maria among all children's names
    Maria_girls_position.append(ranks['sex'])         # rank of Maria among girls' names
    Maria_total_percent.append(counts['w']['total'])  # share of all children named Maria
    Maria_girls_percent.append(counts['w']['sex'])    # share of girls named Maria

Maria_dict = {
    'years': years,
    'Maria_abs_freq': Maria_abs_freq,
    'Maria_total_position': Maria_total_position,
    'Maria_girls_position': Maria_girls_position,
    'Maria_total_percent': Maria_total_percent,
    'Maria_girls_percent': Maria_girls_percent,
}

Maria_df = pd.DataFrame(Maria_dict)

# Counts and ranks become integers, shares become floats; 'years' is left as it is.
column_dtypes = {
    'Maria_abs_freq': 'int64',
    'Maria_total_position': 'int64',
    'Maria_girls_position': 'int64',
    'Maria_total_percent': 'float64',
    'Maria_girls_percent': 'float64',
}
for col_name, dtype in column_dtypes.items():
    Maria_df[col_name] = Maria_df[col_name].astype(dtype)

# Show the resulting data frame.
print("Maria's analysis in Catalonia\n")
display(Maria_df)
Maria's analysis in Catalonia
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 466 8 4 7.38 15.15
1 2019 387 13 6 6.36 13.00
2 2020 318 20 10 5.49 11.23
3 2021 327 19 8 5.67 11.65
4 2022 267 28 13 4.74 9.83

Plotting the results#

# Every column except the year axis gets its own panel, titled in the same order.
selected_columns = [col for col in Maria_df.columns if col != 'years']
titles = ["Absolute Frequencies Maria in Catalonia", "Ranking: Maria total position in Catalonia",
          "Ranking: Maria girls position in Catalonia", "Maria total % in Catalonia", "Maria girls % in Catalonia"]

# Grid layout: fixed number of columns, as many rows as needed.
num_cols = 2
num_rows = int(np.ceil(len(selected_columns) / num_cols))

fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 10))
axes = axes.flatten()  # 1-D view so panels can be walked in order

colors = sns.color_palette("tab10", len(selected_columns))

# One bar chart per statistic, each in its own colour.
for ax, col_name, color, title in zip(axes, selected_columns, colors, titles):
    sns.barplot(x="years", y=col_name, data=Maria_df, color=color, ax=ax)
    ax.set_title(title, fontsize=13)
    ax.set_xticks(np.arange(len(Maria_df)))
    ax.tick_params(axis='x', rotation=0)
    ax.set_xlabel('Year')
    ax.set_ylabel('')

# Remove the panels of the grid that received no plot.
for unused_ax in axes[len(selected_columns):]:
    fig.delaxes(unused_ax)

# General title for the whole figure.
plt.suptitle("Maria's born in Catalonia - Statistical information\n Last five years (2018-2022)",
             fontsize=15, y=0.98, weight='bold')
# Vertical (hspace) and horizontal (wspace) spacing between panels.
plt.subplots_adjust(hspace=0.5, wspace=0.25)
fig.savefig('Marias_Catalonia.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()
_images/eb29e9f6b6b6f5e9721a368e548e15cea1cdcd95ce34cafe630e00504d592d89.png

Maria’s analysis in Catalonia by city (comarca)#

Extracting and processing the data#

print('Maria\'s analysis in Catalonia by city (comarca)\n')

years = ['2018', '2019', '2020', '2021', '2022']

# The request URL depends only on the year, and its response already holds one
# entry per comarca. Download each year once and reuse it for every comarca
# (5 requests instead of one request per comarca and year).
com_entries_by_year = dict()
for year in years:
    url = f'https://api.idescat.cat/onomastica/v1/nadons/dades.json?id=40683&class=com&t={year}&lang=es'
    response = requests.get(url)
    data_json = response.json()
    com_entries_by_year[year] = data_json['onomastica_nadons']['ff']['f']

# comarca name -> data frame with one row per year
Maria_df_dict = dict()

for com_id in range(0, 42): # 42 = len(data_json['onomastica_nadons']['ff']['f']) - 1, i.e. the last entry is left out

    # Entries of this comarca, in the same order as `years`.
    com_rows = [com_entries_by_year[year][com_id] for year in years]

    Maria_dict = {
        'years': years,
        'Maria_abs_freq': [row['pos1']['v'] for row in com_rows],                 # abs freq of Maria among children born in the comarca
        'Maria_total_position': [row['rank']['total'] for row in com_rows],      # rank of Maria among all children's names
        'Maria_girls_position': [row['rank']['sex'] for row in com_rows],        # rank of Maria among girls' names
        'Maria_total_percent': [row['pos1']['w']['total'] for row in com_rows],  # share of all children named Maria
        'Maria_girls_percent': [row['pos1']['w']['sex'] for row in com_rows],    # share of girls named Maria
    }

    # The comarca name is taken from the entry of the last year.
    com = com_rows[-1]['c']['content']
    Maria_df_dict[com] = pd.DataFrame(Maria_dict)
Maria's analysis in Catalonia by city (comarca)
# Catalonia city (comarca) with more Maria's born by year

years = ['2018', '2019', '2020', '2021', '2022']

# year -> {comarca: absolute frequency}; comarcas whose value is the '_' placeholder are left out.
Maria_abs_freq_year = dict()

for year in years:

    counts_by_com = dict()

    for com, com_df in Maria_df_dict.items():
        year_rows = com_df[com_df['years'] == year]
        count = year_rows['Maria_abs_freq'].iloc[0]
        if count == '_':  # missing value for this comarca and year
            continue
        counts_by_com[com] = count

    Maria_abs_freq_year[year] = counts_by_com


Maria_abs_freq_values = dict()  # year -> integer array of frequencies
Maria_abs_freq_index = dict()   # year -> array of comarca names, aligned with the values
city_max_Maria = dict()         # year -> comarca with the highest frequency

for year in years:

    counts_by_com = Maria_abs_freq_year[year]
    Maria_abs_freq_values[year] = np.array(list(counts_by_com.values()), dtype=int)
    Maria_abs_freq_index[year] = np.array(list(counts_by_com.keys()))

    top_position = np.argmax(Maria_abs_freq_values[year])
    city_max_Maria[year] = Maria_abs_freq_index[year][top_position]
    print(f"Catalonia city (comarca) with more Maria's born in {year} -->", city_max_Maria[year])
Catalonia city (comarca) with more Maria's born in 2018 --> Barcelonès
Catalonia city (comarca) with more Maria's born in 2019 --> Barcelonès
Catalonia city (comarca) with more Maria's born in 2020 --> Barcelonès
Catalonia city (comarca) with more Maria's born in 2021 --> Barcelonès
Catalonia city (comarca) with more Maria's born in 2022 --> Barcelonès

Plotting the results#

# Grid layout: one panel per year, three panels per row.
num_cols = 3
num_rows = int(np.ceil(len(years) / num_cols))

fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 13))
axes = axes.flatten()  # 1-D view so panels can be walked in order

colors = sns.color_palette("tab10", len(years))

# Horizontal bar chart of the frequencies by comarca, one year per panel.
for ax, color, year in zip(axes, colors, years):
    sns.barplot(x=Maria_abs_freq_values[year], y=Maria_abs_freq_index[year], color=color, ax=ax)
    ax.set_title(year, fontsize=13)
    ax.tick_params(axis='x', rotation=0)
    ax.set_xlabel('Count')
    ax.set_ylabel('Cities')

# Remove the panels of the grid that received no plot.
for unused_ax in axes[len(years):]:
    fig.delaxes(unused_ax)

# General title for the whole figure.
plt.suptitle('Absolute Frequencies Maria by Catalonia cities (comarcas)\n Last five years (2018-2022)',
             fontsize=15, y=0.95, weight='bold')
# Vertical (hspace) and horizontal (wspace) spacing between panels.
plt.subplots_adjust(hspace=0.25, wspace=0.8)
# fig.savefig('Marias_Areas_Catalonia' + '.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()
_images/b9f3dc59349abd1f5ccd6ce483db9c9876bc87a8666d068b0497f62a286e8ff3.png

Gender info for the children born in Catalonia in the last nine years (2014-2022)#

Extracting and processing the data#

def try_float(x):
    """Return ``float(x)``, or ``None`` when the conversion raises ``ValueError``."""
    try:
        value = float(x)
    except ValueError:
        value = None
    return value

url = 'https://www.idescat.cat/indicadors/?id=aec&n=15237&fil=43&lang=en'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table')

# Column names come from the header cells; the first cell is replaced by 'Year'.
columns = table.find(class_="cap")
columns_html = columns.select("thead th")
columns_list = ['Year'] + [th.text for th in columns_html[1:]]

# Data rows: skip the first <tr> and the last three.
tr_list = table.select('tr')
useful_info_index = range(1,(len(tr_list)-3))

rows = dict()
for i , r in enumerate(useful_info_index) :
    tokens = tr_list[r].text.replace('\n', ' ').split()
    # The English page writes counts with ',' as thousands separator (e.g. 29,187).
    # Remove it rather than turning it into a decimal point, which would parse
    # 29,187 births as 29.187.
    parsed = [try_float(token.replace(',', '')) for token in tokens]
    # Tokens that are not numbers come back as None and are dropped.
    rows[i] = [value for value in parsed if value is not None]

# `rows` maps row number -> values, so transpose to get one table row per data frame row.
df = pd.DataFrame(rows).T
df.columns = columns_list
df['Year'] = df['Year'].astype('int')
df['Boys_prop'] = round(df['Boys'] / df['Total'], 3)
df['Girls_prop'] = round(df['Girls'] / df['Total'], 3)
# Data-frame with the data

print('Gender info for the childs born in Catalonia in the last nine years (2014-2022)\n')
display(df)
Gender info for the childs born in Catalonia in the last nine years (2014-2022)
Year Boys Girls Total Boys_prop Girls_prop
0 2022 29.187 27.195 56.382 0.518 0.482
1 2021 29.604 28.030 57.634 0.514 0.486
2 2020 29.832 28.632 58.464 0.510 0.490
3 2019 31.453 30.095 61.548 0.511 0.489
4 2018 32.623 30.943 63.566 0.513 0.487
5 2017 34.462 32.341 66.803 0.516 0.484
6 2016 35.435 33.539 68.974 0.514 0.486
7 2015 36.406 34.044 70.450 0.517 0.483

Plotting the results#

selected_columns = [x for x in df.columns if x in ['Boys_prop', 'Girls_prop']]

# Panel titles are looked up by column name. A separate positional list can get
# out of step with the column order (previously 'Boys_prop' was titled 'Girls'
# and 'Girls_prop' was titled 'Boys').
title_by_column = {'Boys_prop': 'Boys', 'Girls_prop': 'Girls'}
titles = [title_by_column[col_name] for col_name in selected_columns]

# Define the number of rows and columns for the matrix plot
num_cols = 2  # You can adjust the number of columns as needed
num_rows = int(np.ceil(len(selected_columns) / num_cols))

# Create a subplot with the specified number of rows and columns
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8, 3))

# Flatten the axes array to make it easier to iterate
axes = axes.flatten()

colors = sns.color_palette("tab10", len(selected_columns))

# One bar chart per proportion column
for ax, color, col_name, title in zip(axes, colors, selected_columns, titles) :
    sns.barplot(x="Year", y=col_name, data=df, color=color, ax=ax)
    ax.set_title(f"{title}", fontsize=11)
    xticks_index = np.arange(0, len(df), 1)
    ax.set_xticks(xticks_index)
    ax.tick_params(axis='x', rotation=0, labelsize=9)
    ax.tick_params(axis='y', rotation=0, labelsize=9)
    ax.set_xlabel('Year')
    ax.set_ylabel(' ')
    # Same narrow y-range on both panels so they can be compared directly
    ax.set_ylim([0.45, 0.54])

# Remove any unused subplots in case there are fewer columns than num_rows * num_cols
for j in range(len(selected_columns), num_rows * num_cols):
    fig.delaxes(axes[j])

plt.suptitle('Relative frequencies of the gender of childs born in Catalonia\n Last nine years (2014-2022)', fontsize=12, y=1.1, weight='bold') # General title for the plot.
plt.subplots_adjust(hspace=0.6, wspace=0.3) # Adjust vertical (hspace) and horizontal (wspace) spacing
# fig.savefig('Gender_born_Catalonia' + '.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()
_images/b01f0738ebb5113ceb608e2468e8d1db733778e4d8bf845f77182809e8e79dd0.png

Maria’s analysis in Catalonia by province#

Extracting and processing the data#

years = ['2018', '2019', '2020', '2021', '2022']

# The request URL depends only on the year, and its response already holds one
# entry per province. Download each year once and reuse it for every province
# (5 requests instead of one request per province and year).
prov_entries_by_year = dict()
for year in years:
    url = f'https://api.idescat.cat/onomastica/v1/nadons/dades.json?id=40683&class=prov&t={year}&lang=es'
    response = requests.get(url)
    data_json = response.json()
    prov_entries_by_year[year] = data_json['onomastica_nadons']['ff']['f']

# province name -> data frame with one row per year
Maria_df_dict = dict()

for prov_id in range(0, 4): # 4 = len(data_json['onomastica_nadons']['ff']['f']) - 1, i.e. the last entry is left out

    # Entries of this province, in the same order as `years`.
    prov_rows = [prov_entries_by_year[year][prov_id] for year in years]

    Maria_dict = {
        'years': years,
        'Maria_abs_freq': [row['pos1']['v'] for row in prov_rows],                 # abs freq of Maria among children born in the province
        'Maria_total_position': [row['rank']['total'] for row in prov_rows],      # rank of Maria among all children's names
        'Maria_girls_position': [row['rank']['sex'] for row in prov_rows],        # rank of Maria among girls' names
        'Maria_total_percent': [row['pos1']['w']['total'] for row in prov_rows],  # share of all children named Maria
        'Maria_girls_percent': [row['pos1']['w']['sex'] for row in prov_rows],    # share of girls named Maria
    }

    # The province name is taken from the entry of the last year.
    prov = prov_rows[-1]['c']['content']
    Maria_df_dict[prov] = pd.DataFrame(Maria_dict)
print('Maria\'s analysis in Catalonia by province')

# Show one table per province.
for prov in Maria_df_dict.keys():
    print('-----------------------------------------------------------------------------------------------')
    print(prov)
    display(Maria_df_dict[prov])
Maria's analysis in Catalonia by province
-----------------------------------------------------------------------------------------------
Barcelona
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 344 11 5 7.41 15.19
1 2019 288 14 7 6.46 13.17
2 2020 245 19 9 5.81 11.89
3 2021 _ _ _ _ _
4 2022 189 33 14 4.59 9.56
-----------------------------------------------------------------------------------------------
Girona
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 47 6 2 7.00 14.43
1 2019 27 31 16 4.21 8.62
2 2020 33 21 11 5.24 10.79
3 2021 37 12 5 6.10 12.45
4 2022 19 47 22 3.21 6.52
-----------------------------------------------------------------------------------------------
Lleida
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 17 28 13 4.89 10.15
1 2019 23 8 3 6.69 13.71
2 2020 8 69 38 2.44 4.84
3 2021 17 19 7 5.30 11.06
4 2022 19 15 7 5.89 12.22
-----------------------------------------------------------------------------------------------
Tarragona
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 58 4 2 8.81 18.22
1 2019 49 6 4 7.60 15.85
2 2020 32 19 8 5.23 10.62
3 2021 30 26 12 5.00 10.33
4 2022 40 10 5 6.66 13.70
# Catalonia province with more Maria's born in 2022

# province -> absolute frequency of Maria in 2022
Maria_abs_freq_2022 = dict()

for prov in Maria_df_dict.keys():
    value = Maria_df_dict[prov].loc[Maria_df_dict[prov]['years'] == '2022', 'Maria_abs_freq'].iloc[0]

    # The tables use '_' where a value is missing (e.g. Barcelona in 2021). Skip such
    # provinces, as the per-comarca analysis does, so the integer conversion below cannot fail.
    if value != '_' :
        Maria_abs_freq_2022[prov] = value

# Parallel arrays: frequencies as integers and the province names in the same order.
Maria_abs_freq_2022_values = np.array([x for x in Maria_abs_freq_2022.values()], dtype=int)
Maria_abs_freq_2022_index = np.array([x for x in Maria_abs_freq_2022.keys()])
prov_max_Maria_2022 = Maria_abs_freq_2022_index[np.argmax(Maria_abs_freq_2022_values)]
print('Catalonia province with more Maria\'s born in 2022 -->', prov_max_Maria_2022)
Catalonia province with more Maria's born in 2022 --> Barcelona

Plotting the results#

# Target dtype of every numeric column: counts and ranks are integers, shares are floats.
numeric_dtypes = {
    'Maria_abs_freq': 'int64',
    'Maria_total_position': 'int64',
    'Maria_girls_position': 'int64',
    'Maria_total_percent': 'float64',
    'Maria_girls_percent': 'float64',
}

for prov in Maria_df_dict.keys() :
    prov_df = Maria_df_dict[prov]
    # Rows holding the '_' placeholder (e.g. Barcelona in 2021) cannot be converted
    # to numbers. Find them by content instead of dropping a hard-coded row label,
    # so this also works when the cell is re-run or the missing year changes.
    has_placeholder = prov_df[list(numeric_dtypes)].eq('_').any(axis=1)
    Maria_df_dict[prov] = prov_df.loc[~has_placeholder].astype(numeric_dtypes)

provinces = Maria_df_dict.keys()
# Define the number of rows and columns for the matrix plot
num_cols = 2  # You can adjust the number of columns as needed
num_rows = int(np.ceil(len(provinces) / num_cols))

# Create a subplot with the specified number of rows and columns
fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 8))

# Flatten the axes array to make it easier to iterate
axes = axes.flatten()

colors = sns.color_palette("tab10", len(provinces))

# One bar chart of the absolute frequencies per province
for i, color, prov in zip(range(0,len(provinces)), colors, provinces) :
    ax = axes[i]  # Get the current axis
    sns.barplot(x="years", y='Maria_abs_freq', data=Maria_df_dict[prov], color=color, ax=ax)
    ax.set_title(f"{prov}", fontsize=12)
    xticks_index = np.arange(0, len(Maria_df_dict[prov]), 1)
    ax.set_xticks(xticks_index)
    ax.tick_params(axis='x', rotation=0)
    ax.set_xlabel('Year')
    ax.set_ylabel('Count')

# Remove any unused subplots in case there are fewer provinces than num_rows * num_cols
for j in range(len(provinces), num_rows * num_cols):
    fig.delaxes(axes[j])

plt.suptitle('Absolute Frequencies Maria by Catalonia provinces\n Last five years (2018-2022)', fontsize=15, y=0.98, weight='bold') # General title for the plot.
plt.subplots_adjust(hspace=0.3, wspace=0.3) # Adjust vertical (hspace) and horizontal (wspace) spacing
# fig.savefig('Marias_Provinces_Catalonia' + '.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()
_images/dd0445958d90daa6be4dd2790acd2bc4a513b220cb1074d82f65c0f24767815b.png

Maria’s analysis in Catalonia by area (region)#

Extracting and processing the data#

years = ['2018', '2019', '2020', '2021', '2022']

# The request URL depends only on the year, and its response already holds one
# entry per area. Download each year once and reuse it for every area
# (5 requests instead of one request per area and year).
at_entries_by_year = dict()
for year in years:
    url = f'https://api.idescat.cat/onomastica/v1/nadons/dades.json?id=40683&class=at&t={year}&lang=es'
    response = requests.get(url)
    data_json = response.json()
    at_entries_by_year[year] = data_json['onomastica_nadons']['ff']['f']

# area name -> data frame with one row per year
Maria_df_dict = dict()

for at_id in range(0, 8): # 8 = len(data_json['onomastica_nadons']['ff']['f']) - 1, i.e. the last entry is left out

    # Entries of this area, in the same order as `years`.
    at_rows = [at_entries_by_year[year][at_id] for year in years]

    Maria_dict = {
        'years': years,
        'Maria_abs_freq': [row['pos1']['v'] for row in at_rows],                 # abs freq of Maria among children born in the area
        'Maria_total_position': [row['rank']['total'] for row in at_rows],      # rank of Maria among all children's names
        'Maria_girls_position': [row['rank']['sex'] for row in at_rows],        # rank of Maria among girls' names
        'Maria_total_percent': [row['pos1']['w']['total'] for row in at_rows],  # share of all children named Maria
        'Maria_girls_percent': [row['pos1']['w']['sex'] for row in at_rows],    # share of girls named Maria
    }

    # The area name is taken from the entry of the last year.
    at = at_rows[-1]['c']['content']
    Maria_df_dict[at] = pd.DataFrame(Maria_dict)
print('Maria\'s analysis in Catalonia by area (region)')

# Show one table per area.
for at in Maria_df_dict.keys():
    print('-----------------------------------------------------------------------------------------------')
    print(at)
    display(Maria_df_dict[at])
Maria's analysis in Catalonia by area (region)
-----------------------------------------------------------------------------------------------
Metropolità
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 308 10 5 7.67 15.66
1 2019 246 15 7 6.38 13.01
2 2020 210 22 10 5.77 11.80
3 2021 211 20 8 5.77 11.86
4 2022 164 30 13 4.63 9.63
-----------------------------------------------------------------------------------------------
Comarques Gironines
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 47 6 2 7.11 14.67
1 2019 27 31 16 4.29 8.77
2 2020 33 21 11 5.30 10.90
3 2021 37 12 5 6.21 12.69
4 2022 19 44 20 3.27 6.65
-----------------------------------------------------------------------------------------------
Camp de Tarragona
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 39 5 3 8.72 18.03
1 2019 35 3 2 8.26 17.34
2 2020 18 28 13 4.42 8.89
3 2021 17 37 18 4.27 8.89
4 2022 30 5 3 7.60 15.50
-----------------------------------------------------------------------------------------------
Terres de l'Ebre
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 14 7 4 10.52 21.41
1 2019 11 9 6 8.12 16.69
2 2020 5 30 16 4.13 8.17
3 2021 10 12 6 8.26 16.34
4 2022 5 37 15 4.00 8.35
-----------------------------------------------------------------------------------------------
Ponent
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 14 31 13 4.65 9.60
1 2019 20 7 4 6.78 13.93
2 2020 7 68 34 2.46 4.87
3 2021 16 15 5 5.76 11.98
4 2022 16 15 8 5.71 11.85
-----------------------------------------------------------------------------------------------
Comarques Centrals
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 22 18 8 6.46 13.56
1 2019 29 7 2 8.73 17.91
2 2020 17 19 7 5.40 11.14
3 2021 17 23 11 5.52 11.09
4 2022 17 22 9 5.46 11.36
-----------------------------------------------------------------------------------------------
Alt Pirineu i Aran
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 _ _ _ _ _
1 2019 _ _ _ _ _
2 2020 _ _ _ _ _
3 2021 0 _ _ 0.00 0.00
4 2022 _ _ _ _ _
-----------------------------------------------------------------------------------------------
Penedès
years Maria_abs_freq Maria_total_position Maria_girls_position Maria_total_percent Maria_girls_percent
0 2018 20 23 12 5.38 11.22
1 2019 18 24 12 4.85 9.91
2 2020 27 4 3 7.59 15.86
3 2021 16 38 16 4.32 9.09
4 2022 15 38 18 4.26 8.87
# Catalonia region with more Maria's born in 2022

# 'Alt Pirineu i Aran' is left out of the comparison and of the plots.
selected_at = [name for name in Maria_df_dict if name != 'Alt Pirineu i Aran']

# area -> absolute frequency of Maria in 2022
Maria_abs_freq_2022 = dict()
for at in selected_at:
    at_df = Maria_df_dict[at]
    rows_2022 = at_df[at_df['years'] == '2022']
    Maria_abs_freq_2022[at] = rows_2022['Maria_abs_freq'].iloc[0]

# Parallel arrays: frequencies as integers and the area names in the same order.
Maria_abs_freq_2022_values = np.array(list(Maria_abs_freq_2022.values()), dtype=int)
Maria_abs_freq_2022_index = np.array(list(Maria_abs_freq_2022.keys()))

top_position = np.argmax(Maria_abs_freq_2022_values)
prov_max_Maria_2022 = Maria_abs_freq_2022_index[top_position]
print("Catalonia area with more Maria's born in 2022 -->", prov_max_Maria_2022)
Catalonia area with more Maria's born in 2022 --> Metropolità

Plotting the results#

# Counts and ranks become integers, shares become floats; 'years' is left as it is.
column_dtypes = {
    'Maria_abs_freq': 'int64',
    'Maria_total_position': 'int64',
    'Maria_girls_position': 'int64',
    'Maria_total_percent': 'float64',
    'Maria_girls_percent': 'float64',
}
for at in selected_at:
    at_df = Maria_df_dict[at]
    for col_name, dtype in column_dtypes.items():
        at_df[col_name] = at_df[col_name].astype(dtype)

# Grid layout: one panel per selected area, two panels per row.
num_cols = 2
num_rows = int(np.ceil(len(selected_at) / num_cols))

fig, axes = plt.subplots(num_rows, num_cols, figsize=(11, 10))
axes = axes.flatten()  # 1-D view so panels can be walked in order

colors = sns.color_palette("tab10", len(selected_at))

# Bar chart of the yearly absolute frequencies, one area per panel.
for ax, color, at in zip(axes, colors, selected_at):
    at_df = Maria_df_dict[at]
    sns.barplot(x="years", y='Maria_abs_freq', data=at_df, color=color, ax=ax)
    ax.set_title(at, fontsize=13)
    ax.set_xticks(np.arange(len(at_df)))
    ax.tick_params(axis='x', rotation=0)
    ax.set_xlabel('Year')
    ax.set_ylabel('Count')

# Remove the panels of the grid that received no plot.
for unused_ax in axes[len(selected_at):]:
    fig.delaxes(unused_ax)

# General title for the whole figure.
plt.suptitle('Absolute Frequencies Maria by Catalonia areas - Last five years (2018-2022)',
             fontsize=15, y=0.95, weight='bold')
# Vertical (hspace) and horizontal (wspace) spacing between panels.
plt.subplots_adjust(hspace=0.8, wspace=0.3)
# fig.savefig('Marias_Areas_Catalonia' + '.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()
_images/f256a7f7cbfde95d0e32ffb0dd77bba63c5e72af31a53f6fa5739090e1647e36.png