Extracting and analyzing data form Eurostat

Contents

Extracting and analyzing data form `Eurostat`#

Requirements#

import requests
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")

Extracting the data#

# Construct the URL according to Eurostat's API documentation specifics.
url = 'https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/ei_bsco_m?format=JSON&unit=BAL&indic=BS-CSMCI&s_adj=SA&lang=en'
# Send a request to the Eurostat API
response = requests.get(url)
data_json = response.json()  

Processing the data#

indicator_dict = data_json['value']
geo_dict = data_json['dimension']['geo']['category']['index']
geo_dict_2 = data_json['dimension']['geo']['category']['label']
time_dict = data_json['dimension']['time']['category']['index']

n_time_ids = len(time_dict)
def indicator_id(geo_id, time_id):
    result = geo_id*n_time_ids + time_id
    return result

value_list = list()
geo_list = list()
time_list = list()
geo_values = [x for x in geo_dict.keys()]
time_values = [x for x in time_dict.keys()]
time_id_nan = data_json['extension']['positions-with-no-data']['time']

for geo_id in geo_dict.values() :
    for time_id in time_dict.values() :
        if time_id not in time_id_nan:

            value_id = str(indicator_id(geo_id, time_id))

            if value_id in indicator_dict.keys():

                value_list.append(indicator_dict[value_id])
                geo_list.append(geo_dict_2[geo_values[geo_id]])
                time_list.append(time_values[time_id])

value_geo_time_dict = dict()
value_geo_time_dict['indicator'] = value_list
value_geo_time_dict['geo'] = geo_list
value_geo_time_dict['date'] = time_list

df = pd.DataFrame(value_geo_time_dict)
df['geo'] = df['geo'].replace({'European Union - 27 countries (from 2020)' : 'EU'})
df_geo = dict()
df_geo_2022 = dict()
df_geo_last_years = dict()
avg_indicator_groupby_year = dict()
df_geo_last_years_2022 = dict()
for geo in df['geo'].unique() :
    df_geo[geo] = df.loc[df['geo'] == geo]
    df_geo_2022[geo] = df_geo[geo].loc[(df_geo[geo]['date'] >= '2022-01') & (df_geo[geo]['date'] <= '2022-12'),:]
    df_geo_last_years[geo] = df_geo[geo].loc[(df_geo[geo]['date'] >= '2017-01') & (df_geo[geo]['date'] <= '2021-12'),:]
    df_geo_last_years[geo].loc[:,'year'] = df_geo_last_years[geo]['date'].apply(lambda x : x[0:4])
    avg_indicator_groupby_year[geo] = df_geo_last_years[geo].groupby('year').mean('indicator').reset_index()
    df_geo_last_years_2022[geo] = pd.concat([avg_indicator_groupby_year[geo].rename(columns={'year':'date'}), df_geo_2022[geo]]).drop('geo', axis=1)
    avg_indicator_groupby_year[geo]  =  avg_indicator_groupby_year[geo].rename(columns={'indicator': 'avg_indicator'})

selected_geos = ['EU', 'Spain', 'Sweden', 'Italy', 'France', 'Portugal', 'Denmark', 'Germany']

print('---------------------------------------------------------------------------------------')

print('Selected Countries for the analysis: ', selected_geos)

print('--------------------------------------------------------------------------------------')

print('Just for getting and idea of how the dictionaries of data-frames generated from Eurostat looks like (they will be run only with EU and Spain, just for simplicity):')

print('---------------------------------------------------------------------------------------')

for geo in ['EU', 'Spain']:

    print(f'df_geo[{geo}]\n')
    print(df_geo[geo])
    print('---------------------------------------------------------------------------------------')
    print(f'df_geo_2022[{geo}]\n')
    print(df_geo_2022[geo])
    print('---------------------------------------------------------------------------------------')
    print(f'avg_indicator_groupby_year[{geo}]\n')
    print(avg_indicator_groupby_year[geo])
    print('---------------------------------------------------------------------------------------')
    print(f'df_geo_last_years_2022[{geo}]\n')
    print(df_geo_last_years_2022[geo])
    print('---------------------------------------------------------------------------------------')


print('---------------------------------------------------------------------------------------')


#################################################################

---------------------------------------------------------------------------------------
Selected Countries for the analysis:  ['EU', 'Spain', 'Sweden', 'Italy', 'France', 'Portugal', 'Denmark', 'Germany']
--------------------------------------------------------------------------------------
Just for getting and idea of how the dictionaries of data-frames generated from Eurostat looks like (they will be run only with EU and Spain, just for simplicity):
---------------------------------------------------------------------------------------
df_geo[EU]

     indicator geo     date
      -10.3  EU  1985-01
      -10.7  EU  1985-02
      -11.5  EU  1985-03
      -10.9  EU  1985-04
      -11.3  EU  1985-05
..         ...  ..      ...
    -17.5  EU  2023-11
    -15.9  EU  2023-12
    -16.2  EU  2024-01
    -15.8  EU  2024-02
    -15.2  EU  2024-03

[471 rows x 3 columns]
---------------------------------------------------------------------------------------
df_geo_2022[EU]

     indicator geo     date
    -10.8  EU  2022-01
    -10.7  EU  2022-02
    -22.6  EU  2022-03
    -23.2  EU  2022-04
    -22.8  EU  2022-05
    -25.3  EU  2022-06
    -28.3  EU  2022-07
    -26.1  EU  2022-08
    -29.7  EU  2022-09
    -28.4  EU  2022-10
    -25.0  EU  2022-11
    -23.5  EU  2022-12
---------------------------------------------------------------------------------------
avg_indicator_groupby_year[EU]

   year  avg_indicator
2017      -4.358333
2018      -4.391667
2019      -5.958333
2020     -14.508333
2021      -8.458333
---------------------------------------------------------------------------------------
df_geo_last_years_2022[EU]

        date  indicator
     2017  -4.358333
     2018  -4.391667
     2019  -5.958333
     2020 -14.508333
     2021  -8.458333
2022-01 -10.800000
2022-02 -10.700000
2022-03 -22.600000
2022-04 -23.200000
2022-05 -22.800000
2022-06 -25.300000
2022-07 -28.300000
2022-08 -26.100000
2022-09 -29.700000
2022-10 -28.400000
2022-11 -25.000000
2022-12 -23.500000
---------------------------------------------------------------------------------------
df_geo[Spain]

      indicator    geo     date
     -7.1  Spain  1986-06
     -7.2  Spain  1986-07
     -6.4  Spain  1986-08
     -7.2  Spain  1986-09
     -7.6  Spain  1986-10
...         ...    ...      ...
    -19.3  Spain  2023-11
    -18.5  Spain  2023-12
    -18.7  Spain  2024-01
    -17.3  Spain  2024-02
    -16.0  Spain  2024-03

[454 rows x 3 columns]
---------------------------------------------------------------------------------------
df_geo_2022[Spain]

      indicator    geo     date
    -13.3  Spain  2022-01
    -10.9  Spain  2022-02
    -29.3  Spain  2022-03
    -27.4  Spain  2022-04
    -23.7  Spain  2022-05
    -29.4  Spain  2022-06
    -36.2  Spain  2022-07
    -31.9  Spain  2022-08
    -32.0  Spain  2022-09
    -30.9  Spain  2022-10
    -27.6  Spain  2022-11
    -24.8  Spain  2022-12
---------------------------------------------------------------------------------------
avg_indicator_groupby_year[Spain]

   year  avg_indicator
2017      -2.900000
2018      -4.408333
2019      -6.366667
2020     -22.541667
2021     -12.866667
---------------------------------------------------------------------------------------
df_geo_last_years_2022[Spain]

         date  indicator
      2017  -2.900000
      2018  -4.408333
      2019  -6.366667
      2020 -22.541667
      2021 -12.866667
2022-01 -13.300000
2022-02 -10.900000
2022-03 -29.300000
2022-04 -27.400000
2022-05 -23.700000
2022-06 -29.400000
2022-07 -36.200000
2022-08 -31.900000
2022-09 -32.000000
2022-10 -30.900000
2022-11 -27.600000
2022-12 -24.800000
---------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------

Plotting the results#

# Define the number of rows and columns for the matrix plot
num_cols = 2  # You can adjust the number of columns as needed
num_rows = int(np.ceil(len(selected_geos) / num_cols))

# Create a subplot with the specified number of rows and columns
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 10))

# Flatten the axes array to make it easier to iterate
axes = axes.flatten()

colors = sns.color_palette("tab10", len(selected_geos))

# Loop through each 'geo' and create a subplot in the matrix
for (i, geo), color in zip(enumerate(selected_geos), colors) :
    ax = axes[i]  # Get the current axis
    sns.lineplot(x="date", y="indicator", data=df_geo_last_years[geo], color=color, ax=ax)
    ax.set_title(f"{geo}", fontsize=13)
    xticks_index = np.arange(0, len(df_geo_last_years[geo]), 4)
    xticks_index = xticks_index.tolist()
    xticks_index.append(len(df_geo_last_years[geo])-1)
    ax.set_xticks(xticks_index)
    ax.tick_params(axis='x', rotation=90, labelsize=9)
    ax.set_xlabel('Date')
    ax.set_ylabel('Indicator')

# Remove any unused subplots in case the number of 'geo' values is less than num_rows * num_cols
for j in range(len(selected_geos), num_rows * num_cols):
    fig.delaxes(axes[j])

plt.suptitle('Monthly Time Series - Last five years (2017-2021)', fontsize=15, y=0.95, weight='bold') # Establishing a general tittle for the plot.
plt.subplots_adjust(hspace=1.5, wspace=0.25) # Adjust vertical (hspace) and horizontal (wspace) spacing
# fig.savefig('mothly_time_series_last_five_years' + '.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()

_images/d1da8dedfaf985c524ef133aba4cdefe4ce662817f2415fee4d281ea66ea6f4a.png

# Define the number of rows and columns for the matrix plot
num_cols = 2  # You can adjust the number of columns as needed
num_rows = int(np.ceil(len(selected_geos) / num_cols))

# Create a subplot with the specified number of rows and columns
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 10))

# Flatten the axes array to make it easier to iterate
axes = axes.flatten()

colors = sns.color_palette("tab10", len(selected_geos))

# Loop through each 'geo' and create a subplot in the matrix
for (i, geo), color in zip(enumerate(selected_geos), colors) :
    ax = axes[i]  # Get the current axis
    sns.barplot(x="date", y="indicator", data=df_geo_last_years_2022[geo], color=color, ax=ax)
    ax.set_title(f"{geo}", fontsize=13)
    xticks_index = np.arange(0, len(df_geo_last_years_2022[geo]), 1)
    ax.set_xticks(xticks_index)
    ax.tick_params(axis='x', rotation=90, labelsize=9)
    ax.set_xlabel('Date')
    ax.set_ylabel('Indicator')

# Remove any unused subplots in case the number of 'geo' values is less than num_rows * num_cols
for j in range(len(selected_geos), num_rows * num_cols):
    fig.delaxes(axes[j])

plt.suptitle('Barplot - Yearly (2017-2021) - Monthly (2022)', fontsize=15, y=0.95, weight='bold') # Establishing a general tittle for the plot.
plt.subplots_adjust(hspace=1.5, wspace=0.25) # Adjust vertical (hspace) and horizontal (wspace) spacing
# fig.savefig('barplot_last_five_years_and_2022' + '.jpg', format='jpg', dpi=550)
# plt.tight_layout()
plt.show()

_images/dc822a750336eba6ca12fefa34e0f8106ba79f9eb07e2f06f699a40befd213c1.png