Skip to content

Instantly share code, notes, and snippets.

@alekrutkowski
Last active April 29, 2025 08:02
Show Gist options
  • Save alekrutkowski/5b3e23e8e851857ab26c10e1ce286d6b to your computer and use it in GitHub Desktop.
Save alekrutkowski/5b3e23e8e851857ab26c10e1ce286d6b to your computer and use it in GitHub Desktop.
ESSPROS social protection expenditure real growth decomposition including early estimates
renameColumns <- function(dt, ...) {
pairs <-
substitute(list(...)) %>%
as.list %>%
tail(-1) %>%
lapply(. %>% as.list %>% tail(-1) %>% rev)
from <-
pairs %>%
sapply(. %>% .[[1]] %>% as.character)
to <-
pairs %>%
sapply(. %>% .[[2]] %>% as.character)
setnames(dt, from, to)
}
renameColumnsWithFunction <- function(dt, FUN)
setnames(dt,
colnames(dt),
colnames(dt) %>% sapply(FUN))
addUp <- function(dt, colnames.)
dt %>%
.[, paste0(colnames.,collapse="") :=
Reduce(`+`,
c(mget(colnames.)))] %>%
.[, (colnames.) := NULL]
v <- function(...) # to avoid typing single or double quotes
substitute(list(...)) %>%
as.list %>%
tail(-1) %>%
as.character
library(magrittr)
library(data.table)
library(openxlsx2)
library(rvest)
source('helper_functions.R')
write_xlsx <- function(x, file_name)
openxlsx2::write_xlsx(x, file_name, as_table=TRUE)
EU <-
v(AT,BE,BG,CY,CZ,DE,DK,EE,EL,ES,FI,FR,HR,HU,IE,
IT,LT,LU,LV,MT,NL,PL,PT,RO,SE,SI,SK,
EU27_2020)
# Import early estimates --------------------------------------------------
WEB_PAGE <-
"https://ec.europa.eu/eurostat/web/social-protection/database/early-estimates"
URL <-
WEB_PAGE %>%
read_html %>%
html_elements("a[href$='.xlsx']") %>% # Select all <a> tags with href ending in .xlsx
html_attr("href") %>% # Extract the href attribute (the link itself)
url_absolute(WEB_PAGE) # Handle relative URLs by converting them to absolute ones if needed
if (is.character(URL) && length(URL)==1 && URL!="")
message('Found Excel file URL:\n',URL) else
stop('Could not find URL of Excel file in the early estimates web page!')
LATEST_YEAR <-
read_xlsx(URL, sheet="MIO_NAC", rows=2, cols=6,
col_names=FALSE) %>%
as.integer
addUpColumns <- function(dt)
dt %>%
addUp(v(HOUSE,EXCLU)) %>%
addUp(v(OLD,SURVIV)) %>%
addUp(v(SICK,DISA)) # %>% "other" too small to report
# .[, OTHER :=
# SPBENEFNOREROUTE - HOUSEEXCLU - OLDSURVIV - SICKDISA - FAM - UNEMPLOY]
early_estim <-
URL %>%
read_xlsx(sheet="MIO_NAC", start_row=4, cols=1:10) %>%
set_colnames(colnames(.) %>%
ifelse(is.na(.),'geo_labels',.)) %>%
as.data.table %>%
.[geo_labels != 'EU27_2020'] %>% # empty EU27
rbind(
URL %>%
read_xlsx(sheet="MIO_EUR", cols=1:10, rows=4:5) %>% # not empty EU27
set_colnames(colnames(.) %>%
ifelse(is.na(.),'geo_labels',.))
) %>%
.[, geo_labels := geo_labels %>%
ifelse(grepl('EU27_2020',.),'European Union - 27 countries (from 2020)',.)] %>%
renameColumnsWithFunction(. %>%
trimws %>%
gsub('\n',"",.,fixed=TRUE) %>%
ifelse(grepl('TOTAL',.),'SPBENEFNOREROUTE',.)) %>%
renameColumns("Sickness/Health care" -> SICK,
Disability -> DISA,
"Old age" -> OLD,
Survivors -> SURVIV,
"Family/Children" -> FAM,
Unemployment -> UNEMPLOY,
Housing -> HOUSE,
"Social exclusion n.e.c." -> EXCLU) %>%
addUpColumns %>%
melt(id.vars='geo_labels', variable.name='spdeps',
value.name='value_', variable.factor=FALSE) %>%
.[, time := LATEST_YEAR] %>%
merge(importLabels('geo'), by='geo_labels') %>%
.[, geo_labels := NULL] %>%
.[geo %in% EU] %T>%
write_xlsx('early_estim.xlsx')
# Import time series for expenditure --------------------------------------
expend <-
importData('spr_exp_sum',
list(unit='MIO_NAC',
spdeps=v(SPBENEFNOREROUTE, SICK, DISA,
OLD, SURVIV, FAM,
UNEMPLOY, HOUSE, EXCLU))) %>%
as.data.table %>%
.[geo != 'EU27_2020'] %>%
rbind(
importData('spr_exp_sum',
list(unit='MIO_EUR',
geo='EU27_2020',
spdeps=v(SPBENEFNOREROUTE, SICK, DISA,
OLD, SURVIV, FAM,
UNEMPLOY, HOUSE, EXCLU)))
) %T>%
write_xlsx('spr_exp_sum.xlsx') %>%
.[, v(flags_,freq,unit) := NULL] %>%
renameColumns(TIME_PERIOD -> time) %>%
.[, time := time %>% as.character %>% as.integer] %>%
.[geo %in% EU] %>%
dcast(geo + time ~ spdeps,
fun.aggregate=identity,
value.var='value_',
fill=NA_real_) %>%
addUpColumns %>%
melt(id.vars=v(geo,time), variable.name='spdeps',
value.name='value_', variable.factor=FALSE)
stacked_expend <-
rbind(expend, early_estim, fill=TRUE)
# Import HICP -------------------------------------------------------------
hicp_data <-
importData('prc_hicp_aind',
list(coicop='CP00', # All-items HICP
unit='INX_A_AVG' # Annual average index
)) %T>%
write_xlsx('prc_hicp_aind.xlsx') %>%
as.data.table %>%
.[, hicp := value_/100] %>%
.[, v(flags_,freq,coicop,unit,value_) := NULL] %>%
renameColumns(TIME_PERIOD -> time) %>%
.[, time := time %>% as.character %>% as.integer]
# Real growth decomposition -----------------------------------------------
DATA <-
merge(stacked_expend, hicp_data,
by=v(geo,time)) %T>%
write_xlsx('DATA.xlsx')
decomposeGrowth <- function(dt, suffix)
copy(dt) %>%
.[, real_value := value_/hicp] %>%
.[, D_real_value := collapse::D(real_value, t=time)
, by = .(geo,spdeps)] %>%
.[, D_real_value_TOTAL :=
D_real_value[spdeps=='SPBENEFNOREROUTE']
, by=.(geo,time)] %>% # across spdeps components
.[, D_real_value_percent := 100*D_real_value/collapse::L(real_value, t=time)
, by = .(geo,spdeps)] %>%
.[, D_real_value_percent_TOTAL :=
D_real_value_percent[spdeps=='SPBENEFNOREROUTE']
, by=.(geo,time)] %>% # across spdeps components
.[, D_real_value_percentage_points :=
ifelse(D_real_value_TOTAL==0, 0,
D_real_value_percent_TOTAL * D_real_value / D_real_value_TOTAL)] %>%
.[, spdeps := spdeps %>% ifelse(.=='SPBENEFNOREROUTE','TOTAL',.)] %T>%
write_xlsx(paste0('decomposeGrowth',suffix,'.xlsx'))
componentsToColumns <- function(dt)
dt %>%
dcast(geo + time ~ spdeps,
fun.aggregate=identity,
value.var='D_real_value_percentage_points',
fill=NA_real_) %>%
setcolorder('TOTAL',after=ncol(.))
EU_Agg <-
DATA %>%
.[geo=='EU27_2020'] %>%
decomposeGrowth('__EU_Agg') %>%
componentsToColumns
MS_LongTerm <-
DATA %>%
.[geo != 'EU27_2020'] %>%
.[time==LATEST_YEAR | time==LATEST_YEAR-6] %>%
.[, time := time %>% ifelse(.!=max(.), max(.)-1, .)] %>%
decomposeGrowth('__MS_LongTerm') %>%
componentsToColumns %>%
.[time==LATEST_YEAR] %>%
.[, time := NULL] %>%
setorder(-TOTAL)
MS_ShortTerm <-
DATA %>%
.[geo != 'EU27_2020'] %>%
.[time==LATEST_YEAR | time==LATEST_YEAR-1] %>%
decomposeGrowth('__MS_ShortTerm') %>%
componentsToColumns %>%
.[time==LATEST_YEAR] %>%
.[, time := NULL] %>%
setorder(-TOTAL)
Data <-
DATA %>%
decomposeGrowth('__Full_Data') %>%
.[, D_nominal_value := collapse::D(value_, t=time)
, by = .(geo,spdeps)] %>%
.[, D_nominal_value_percent := 100*D_nominal_value/collapse::L(value_, t=time)
, by = .(geo,spdeps)] %>%
renameColumns(value_ -> Value_in_mln_EUR,
real_value -> Real_value_in_constant_prices,
D_nominal_value_percent -> Nominal_growth_in_percent,
D_real_value_percent -> Real_growth_in_percent,
hicp -> HICP_index) %>%
.[, .(geo, time, spdeps,
Value_in_mln_EUR, Real_value_in_constant_prices,
Nominal_growth_in_percent, Real_growth_in_percent, HICP_index)] %>%
.[, spdeps := spdeps %>% ifelse(.=='TOTAL','_TOTAL_',.)] %>%
setorder(geo, time, spdeps)
list(EU_Agg, MS_LongTerm, MS_ShortTerm, Data) %>%
set_names(c('EU_Agg',
paste('MS',LATEST_YEAR-6,LATEST_YEAR,sep='_'),
paste('MS',LATEST_YEAR-1,LATEST_YEAR,sep='_'),
'Data')) %>%
write_xlsx('ESSPROS social protection expenditure decomposition with early estimates.xlsx')
library(magrittr)
library(data.table)
library(openxlsx2)
library(rvest)
library(eurodata)
setwd('C:/Users/rutkoal/OneDrive - European Commission/ESSPROS social protection expenditure decomposition with early estimates')
source('helper_functions.R')
write_xlsx <- function(x, file_name)
openxlsx2::write_xlsx(x, file_name, as_table=TRUE)
EU <-
v(AT,BE,BG,CY,CZ,DE,DK,EE,EL,ES,FI,FR,HR,HU,IE,
IT,LT,LU,LV,MT,NL,PL,PT,RO,SE,SI,SK,
EU27_2020)
# Import early estimates --------------------------------------------------
# WEB_PAGE <-
# "https://ec.europa.eu/eurostat/web/social-protection/database/early-estimates"
#
# URL <-
# WEB_PAGE %>%
# read_html %>%
# html_elements("a[href$='.xlsx']") %>% # Select all <a> tags with href ending in .xlsx
# html_attr("href") %>% # Extract the href attribute (the link itself)
# url_absolute(WEB_PAGE) # Handle relative URLs by converting them to absolute ones if needed
#
# if (is.character(URL) && length(URL)==1 && URL!="")
# message('Found Excel file URL:\n',URL) else
# stop('Could not find URL of Excel file in the early estimates web page!')
#
# LATEST_YEAR <-
# read_xlsx(URL, sheet="MIO_NAC", rows=2, cols=6,
# col_names=FALSE) %>%
# as.integer
addUpColumns <- function(dt)
dt %>%
addUp(v(HOU,EXCL)) %>%
addUp(v(OLD,SRV)) %>%
addUp(v(SICK,DIS)) # %>% "other" too small to report
# .[, OTHER :=
# TOTAL - HOUSEEXCLU - OLDSURVIV - SICKDISA - FAM - UNE]
# early_estim <-
# URL %>%
# read_xlsx(sheet="MIO_NAC", start_row=4, cols=1:10) %>%
# set_colnames(colnames(.) %>%
# ifelse(is.na(.),'geo_labels',.)) %>%
# as.data.table %>%
# .[geo_labels != 'EU27_2020'] %>% # empty EU27
# rbind(
# URL %>%
# read_xlsx(sheet="MIO_EUR", cols=1:10, rows=4:5) %>% # not empty EU27
# set_colnames(colnames(.) %>%
# ifelse(is.na(.),'geo_labels',.))
# ) %>%
# .[, geo_labels := geo_labels %>%
# ifelse(grepl('EU27_2020',.),'European Union - 27 countries (from 2020)',.)] %>%
# renameColumnsWithFunction(. %>%
# trimws %>%
# gsub('\n',"",.,fixed=TRUE) %>%
# ifelse(grepl('TOTAL',.),'TOTAL',.)) %>%
# renameColumns("Sickness/Health care" -> SICK,
# Disability -> DIS,
# "Old age" -> OLD,
# Survivors -> SRV,
# "Family/Children" -> FAM,
# Unemployment -> UNE,
# Housing -> HOU,
# "Social exclusion n.e.c." -> EXCL) %>%
# addUpColumns %>%
# melt(id.vars='geo_labels', variable.name='spfunc',
# value.name='value_', variable.factor=FALSE) %>%
# .[, time := LATEST_YEAR] %>%
# merge(importLabels('geo'), by='geo_labels') %>%
# .[, geo_labels := NULL] %>%
# .[geo %in% EU] %T>%
# write_xlsx('early_estim.xlsx')
# Import time series for expenditure --------------------------------------
expend <-
importData('spr_exp_func',
list(unit='MIO_NAC',
spfunc=v(TOTAL, SICK, DIS,
OLD, SRV, FAM,
UNE, HOU, EXCL))) %>%
as.data.table %>%
.[geo != 'EU27_2020'] %>%
rbind(
importData('spr_exp_func',
list(unit='MIO_EUR',
geo='EU27_2020',
spfunc=v(TOTAL, SICK, DIS,
OLD, SRV, FAM,
UNE, HOU, EXCL)))
) %T>%
write_xlsx('spr_exp_func.xlsx') %>%
.[, v(flags_,freq,unit) := NULL] %>%
renameColumns(TIME_PERIOD -> time) %>%
.[, time := time %>% as.character %>% as.integer] %>%
.[geo %in% EU] %>%
dcast(geo + time ~ spfunc,
fun.aggregate=identity,
value.var='value_',
fill=NA_real_) %>%
addUpColumns %>%
melt(id.vars=v(geo,time), variable.name='spfunc',
value.name='value_', variable.factor=FALSE)
LATEST_YEAR <-
expend %>%
.[, max(time)]
# stacked_expend <-
# rbind(expend, early_estim, fill=TRUE)
# Import HICP -------------------------------------------------------------
hicp_data <-
importData('prc_hicp_aind',
list(coicop='CP00', # All-items HICP
unit='INX_A_AVG' # Annual average index
)) %T>%
write_xlsx('prc_hicp_aind.xlsx') %>%
as.data.table %>%
.[, hicp := value_/100] %>%
.[, v(flags_,freq,coicop,unit,value_) := NULL] %>%
renameColumns(TIME_PERIOD -> time) %>%
.[, time := time %>% as.character %>% as.integer]
# Real growth decomposition -----------------------------------------------
DATA <-
merge(expend, hicp_data,
by=v(geo,time)) %T>%
write_xlsx('DATA.xlsx')
decomposeGrowth <- function(dt, suffix)
copy(dt) %>%
.[, real_value := value_/hicp] %>%
.[, D_real_value := collapse::D(real_value, t=time)
, by = .(geo,spfunc)] %>%
.[, D_real_value_TOTAL :=
D_real_value[spfunc=='TOTAL']
, by=.(geo,time)] %>% # across spfunc components
.[, D_real_value_percent := 100*D_real_value/collapse::L(real_value, t=time)
, by = .(geo,spfunc)] %>%
.[, D_real_value_percent_TOTAL :=
D_real_value_percent[spfunc=='TOTAL']
, by=.(geo,time)] %>% # across spfunc components
.[, D_real_value_percentage_points :=
ifelse(D_real_value_TOTAL==0, 0,
D_real_value_percent_TOTAL * D_real_value / D_real_value_TOTAL)] %>%
.[, spfunc := spfunc %>% ifelse(.=='TOTAL','TOTAL',.)] %T>%
write_xlsx(paste0('decomposeGrowth',suffix,'.xlsx'))
componentsToColumns <- function(dt)
dt %>%
dcast(geo + time ~ spfunc,
fun.aggregate=identity,
value.var='D_real_value_percentage_points',
fill=NA_real_) %>%
setcolorder('TOTAL',after=ncol(.))
EU_Agg <-
DATA %>%
.[geo=='EU27_2020'] %>%
decomposeGrowth('__EU_Agg') %>%
componentsToColumns
MS_LongTerm <-
DATA %>%
.[geo != 'EU27_2020'] %>%
.[time==LATEST_YEAR | time==LATEST_YEAR-6] %>%
.[, time := time %>% ifelse(.!=max(.), max(.)-1, .)] %>%
decomposeGrowth('__MS_LongTerm') %>%
componentsToColumns %>%
.[time==LATEST_YEAR] %>%
.[, time := NULL] %>%
setorder(-TOTAL)
MS_ShortTerm <-
DATA %>%
.[geo != 'EU27_2020'] %>%
.[time==LATEST_YEAR | time==LATEST_YEAR-1] %>%
decomposeGrowth('__MS_ShortTerm') %>%
componentsToColumns %>%
.[time==LATEST_YEAR] %>%
.[, time := NULL] %>%
setorder(-TOTAL)
Data <-
DATA %>%
decomposeGrowth('__Full_Data') %>%
.[, D_nominal_value := collapse::D(value_, t=time)
, by = .(geo,spfunc)] %>%
.[, D_nominal_value_percent := 100*D_nominal_value/collapse::L(value_, t=time)
, by = .(geo,spfunc)] %>%
renameColumns(value_ -> Value_in_mln_EUR,
real_value -> Real_value_in_constant_prices,
D_nominal_value_percent -> Nominal_growth_in_percent,
D_real_value_percent -> Real_growth_in_percent,
hicp -> HICP_index) %>%
.[, .(geo, time, spfunc,
Value_in_mln_EUR, Real_value_in_constant_prices,
Nominal_growth_in_percent, Real_growth_in_percent, HICP_index)] %>%
.[, spfunc := spfunc %>% ifelse(.=='TOTAL','_TOTAL_',.)] %>%
setorder(geo, time, spfunc)
list(EU_Agg, MS_LongTerm, MS_ShortTerm, Data) %>%
set_names(c('EU_Agg',
paste('MS',LATEST_YEAR-6,LATEST_YEAR,sep='_'),
paste('MS',LATEST_YEAR-1,LATEST_YEAR,sep='_'),
'Data')) %>%
write_xlsx('ESSPROS social protection expenditure decomposition.xlsx')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment