# Packages
pacman::p_load(
here,tidyverse,tidymodels,reactable,thematic,scales,arrow
)
results <- here("Tax_and_pension_progressivity","Processed_data")
data_dir <- here("Raw_data")
# Years
sourceyears = c(1991:2019)
# Graphics
theme_set(theme_bw(base_family = "serif"))
thematic_on(
bg = "white", fg = "black",
accent = "auto",
qualitative = okabe_ito()
)
# This will be needed later when we check the accuracy of our parametric estimates
f_indices <-
function(X){
# Function to calculate area under curve
f_area <-
function(x,y){
h = x-lag(x)
h = ifelse(is.na(h),0,h)
ab = y+lag(y)
ab = ifelse(is.na(ab),y,ab)
L = sum(0.5*h*ab)
K = 0.5
S = 1-(L/K)
}
# Function to caclculate indices
I <-
X %>%
# Restrict sample
filter(x>=0&y>=0&t>=0) %>%
select(year,x,y,t) %>%
# Sample stats
mutate(
across(
c('x','y','t'),
list(Mean = mean, Tot = sum),
.names = "{.fn}_{.col}"
)
) %>%
# Average size
mutate(
size = Tot_t/Tot_x
) %>%
# Shares
mutate(
px = x/Tot_x, py = y/Tot_y, pt = t/Tot_t, num=1,
obs =1/sum(num)
) %>%
# Distribution wrt x (Prefix 'X_')
arrange(x) %>%
mutate(
across(
c('px','py','pt','obs'),
cumsum,
.names = "X_{.col}"
),
# Indices (covariance formula)
Gx_cov = (2/Mean_x)*(cov(x,X_obs)), # Gini coef
Ct = (2/Mean_t)*(cov(t,X_obs)),
St = (2/Mean_t)*(cov(t,X_px)),
Cy_cov = (2/Mean_y)*(cov(y,X_obs)),
K_cov = Ct - Gx_cov,
S_cov = St - Gx_cov,
# Indices (area formula)
Gx_area = f_area(X_obs,X_px),
Cy_area = f_area(X_obs,X_py),
Ct = f_area(X_obs,X_pt),
S_area = f_area(X_px,X_pt),
K_area = Ct - Gx_area
) %>%
# Redistributive effect
arrange(y) %>%
mutate(
across(
c('px','py','pt','obs'),
cumsum,
.names = "Y_{.col}"
),
Gy_area = f_area(Y_obs,Y_py),
Gy_cov = (2/Mean_y)*(cov(y,Y_obs)),
R_area = Gx_area - Gy_area,
R_cov = Gx_cov - Gy_cov,
Rerank_area = Gy_area - Cy_area,
Rerank_cov = Gy_cov - Cy_cov,
) %>%
select(year,size,ends_with('area')|ends_with('cov')) %>%
filter(row_number()==1)
}
This appendix explores in detail the estimation of the parametric tax function used in this paper. The estimation of parameters is non-trivial. Parameter values are highly sensitive to the sample that is used in the estimation. In summary, we find that the parametric function does not provide a good fit on raw tax data. However, the fit significantly improves once we restrict the sample within the bounds of the statutory bottom and top income thresholds and top tax rates for each respective year.
Most importantly, we find that including incomes below the tax-free threshold results in over-estimating average tax rates at the bottom and under-estimating tax rates at the top. In turn, the progressivity of the tax code itself is under-estimated. Thus, incomes below the tax-free threshold needs to be excluded while estimating the parametric tax function.
Let \(x\) be post-tax income and \(y\) be some notion of pre-tax income (we shall explore different income concepts). The parametric function maps from \(y\) to \(x\) as
\[ x=\lambda y^{1-\tau} \]
Income tax is given by
\[ T=y-\lambda y^{1-\tau} \]
The average tax rate is
\[ t=1-\lambda y^{-\tau} \]
We estimate \(\tau\) and \(\lambda\) by two methods - ordinary least squares (OLS) and non-linear least squares (NLS).
# 1.0 OLS (lm)
f_tax_lm <-
function(X){
# Take the natural log of the tax function
X <-
X %>%
filter(x>0&y>0) %>%
mutate(log_x = log(x), log_y=log(y))
# Estimate the model
lm_fit <-
linear_reg() %>%
set_engine('lm') %>%
set_mode('regression') %>%
fit(log_x~log_y,data=X)
# Collect the results
Metrics <-
glance(lm_fit) %>%
select(adj.r.squared,sigma,AIC,BIC,nobs) %>%
rename(RMSE=sigma,Rsquared=adj.r.squared)
Coefs <-
tidy(lm_fit) %>%
mutate(
term = if_else(term=="(Intercept)","lambda","tau"),
value = if_else(term=="lambda",exp(estimate),1-estimate),
ci1 = value - 1.96*std.error,
ci2 = value + 1.96*std.error
) %>%
select(term,value,ci1,ci2) %>%
bind_cols(.,Metrics) %>%
mutate(year=X$year[1]) %>%
relocate(year)
}
# 2.0 NLS
f_tax_nls <-
function(X){
nlsfit <-
nls(atr~1-lambda*y^(-tau),X, start = list(lambda=1,tau=0.1))
Metrics <-
glance(nlsfit) %>%
select(sigma,AIC,BIC,nobs) %>%
rename(RMSE=sigma)
Coefs <-
tidy(nlsfit) %>%
mutate(
ci1 = estimate - 1.96*std.error,
ci2 = estimate + 1.96*std.error
) %>%
select(term,estimate,ci1,ci2) %>%
bind_cols(.,Metrics) %>%
mutate(year=X$year[1]) %>%
relocate(year) %>%
rename(value=estimate)
}
We take taxable income “ic_taxable_income_loss” as pre-tax income, net tax after offsets and credits “tc_net_tax” as our tax variable. Both income and tax are adjusted for inflation using the CPI and expressed in 2019 dollars.
varz <- c("ic_taxable_income_loss","tc_net_tax")
CPI <-
read_csv(here(data_dir,"CPI.csv"),show_col_types = F)
f_nominal_to_real <-
function(X,varz_to_real,CPI,i){
X %>%
mutate(cpi=CPI$cpi[i],
across(
all_of(varz_to_real),
~(./cpi)
))
}
Prior to estimating the tax function on the data, we begin by examining the progressivity of the income tax code between 1991-2019. The Australian income tax code is complex with multiple thresholds, marginal tax rates and various tax credits, allowances, deductions and offsets that have changed considerably over the past two decades.
Tracking all these changes to the statutory tax code is a challenging task. Thus, our approach is to focus on the changes to the standard marginal tax rates and thresholds from available public information. We shall capture detailed changes in later sections when we estimate the tax function on the data.
Tax_code_data <-
read_csv(here("Raw_data","Tax_rates.csv"),
show_col_types = FALSE)
Tax_code_data_real <-
full_join(Tax_code_data,CPI,by="year") %>%
mutate(
across(
c(y1,y2,tax),
~(./cpi)
)
)
The following function estimates tax liability using the tax code data.
f_estimate_tax <-
function(X,TD){
# First getting the data into an easier format
## TD - tax code data (can be either nominal or real)
## X - df with raw data (year,inc,tax)
ty <-
TD %>%
select(year,y1) %>%
group_by(year) %>%
mutate(y=paste0("y",row_number())) %>%
pivot_wider(names_from = "y", values_from = "y1")
tr <-
TD %>%
select(year,mtr) %>%
group_by(year) %>%
mutate(mtr=mtr/100,
y=paste0("mtr",row_number())) %>%
pivot_wider(names_from = "y", values_from = "mtr")
tx <-
TD %>%
select(year,tax) %>%
group_by(year) %>%
mutate(
y=paste0("tx",row_number())) %>%
pivot_wider(names_from = "y", values_from = "tax")
Tax_code <-
left_join(ty,tr,by="year") %>%
left_join(.,tx,by="year")
# The following estimates tax liability and selects the right MTR
A <-
left_join(X,Tax_code,by="year") %>%
arrange(y) %>%
mutate(
tax = 0,
tax = if_else(y>=y1 & y<y2 , tx1 + (y-y1+1)*mtr1, tax),
tax = if_else(y>=y2 & y<y3 , tx2 + (y-y2+1)*mtr2, tax),
tax = if_else(y>=y3 & y<y4 , tx3 + (y-y3+1)*mtr3, tax),
tax = if_else(y>=y4 & y<y5 , tx4 + (y-y4+1)*mtr4, tax),
tax = if_else(y>=y5 & year!=1991 & year!=1994,
tx5 + (y-y5+1)*mtr5, tax
),
tax = if_else(
year==1991,
case_when(
y>=y1 & y<y2 ~ tx1 + (y-y1+1)*mtr1,
y>=y2 & y<y3 ~ tx2 + (y-y2+1)*mtr2,
y>=y3 & y<y4 ~ tx3 + (y-y3+1)*mtr3,
y>=y4 & y<y5 ~ tx4 + (y-y4+1)*mtr4,
y>=y5 & y<y6 ~ tx5 + (y-y5+1)*mtr5,
y>=y6 & y<y7 ~ tx6 + (y-y6+1)*mtr6,
y>=y7 & y<y8 ~ tx7 + (y-y7+1)*mtr7,
y>=y8 ~ tx4 + (y-y8+1)*mtr8,
),
tax
),
tax = if_else(
year==1994,
case_when(
y>=y1 & y<y2 ~ tx1 + (y-y1+1)*mtr1,
y>=y2 & y<y3 ~ tx2 + (y-y2+1)*mtr2,
y>=y3 & y<y4 ~ tx3 + (y-y3+1)*mtr3,
y>=y4 & y<y5 ~ tx4 + (y-y4+1)*mtr4,
y>=y5 & y<y6 ~ tx5 + (y-y5+1)*mtr5,
y>=y6 ~ tx6 + (y-y6+1)*mtr6,
),
tax
),
mtr=0,
mtr =
if_else(
year==1991,
case_when(
y>=y1 & y<y2 ~ mtr1,
y>=y2 & y<y3 ~ mtr2,
y>=y3 & y<y4 ~ mtr3,
y>=y4 & y<y5 ~ mtr4,
y>=y5 & y<y6 ~ mtr5,
y>=y6 & y<y7 ~ mtr6,
y>=y7 & y<y8 ~ mtr7,
y>=y8 ~ mtr8,
),
mtr),
mtr =
if_else(
year==1994,
case_when(
y>=y1 & y<y2 ~ mtr1,
y>=y2 & y<y3 ~ mtr2,
y>=y3 & y<y4 ~ mtr3,
y>=y4 & y<y5 ~ mtr4,
y>=y5 & y<y6 ~ mtr5,
y>=y6 ~ mtr6,
),
mtr),
mtr =
if_else(
year!=1991&year!=1994,
case_when(
y>=y1 & y<y2 ~ mtr1,
y>=y2 & y<y3 ~ mtr2,
y>=y3 & y<y4 ~ mtr3,
y>=y4 & y<y5 ~ mtr4,
y>=y5 ~ mtr5
),
mtr
)
) %>%
rename(taxfree=y2) %>%
select(pid,year,y,tax,mtr,taxfree)
}
Z <-
Tax_code_data %>%
mutate(
across(
c(y1,y2,tax),
~format(.,big.mark=",")
),
mtr = round(mtr,digits = 2)
) %>%
rename(
Year=year,
From=y1,
To=y2,
`Tax amount`=tax,
`Marginal tax rate (%)`=mtr
)
htmltools::tagList(
reactable(
Z,
defaultColDef =
colDef(
align = "center"
),
bordered = T,
highlight = T,
filterable = T
)
)
Z <-
Tax_code_data_real %>%
mutate(
across(
c(y1,y2,tax,mtr),
~round(.,digits = 2)
),
across(
c(y1,y2,tax),
~format(.,big.mark=",")
),
) %>%
rename(
Year=year,
From=y1,
To=y2,
`Tax amount`=tax,
`Marginal tax rate (%)`=mtr
) %>%
select(-cpi)
htmltools::tagList(
reactable(
Z,
defaultColDef =
colDef(
align = "center"
),
bordered = T,
highlight = T,
filterable = T
)
)
We estimate statutory tax and the average tax rate for each year.
for (i in seq_along(sourceyears)) {
Z <-
seq(0,200000,by=100) %>%
as_tibble() %>%
rename(y=value) %>%
mutate(pid=1, # Artefact in function
year=sourceyears[i]) %>%
f_estimate_tax(.,Tax_code_data_real) %>%
mutate(atr=if_else(y==0,0,tax/y))
if (i==1){
X=Z
}
else{
X=bind_rows(X,Z)
}
}
X %>%
filter(year==1995|year==2000|year==2005|year==2010|year==2019) %>%
mutate(Year=as.factor(year),
atr=atr*100) %>%
ggplot(aes(x=y,y=atr,color=Year,shape=Year)) +
geom_line() +
labs(x="Real Income (2019$)", y="Average tax rate (%)") +
scale_x_continuous(label=comma, breaks=seq(0,200000,20000))
ggsave(
here(results,"TP_01_ATR_Statutory.pdf")
)
Saving 6.88 x 4.25 in image
NA
We begin by exploring the difference between statutory average tax rates caculated using the tax code and those estimated using the parametric tax function.
Estimating the tax function for the entire range of income between 0 and 200,000 including incomes below the tax-free threshold results over-estimating average tax rates at the bottom and under-estimating average tax rates at the top.
Z <-
seq(0,200000,by=1000) %>%
as_tibble() %>%
rename(y=value) %>%
mutate(pid=1, # Artefact in function
year=2019) %>%
f_estimate_tax(.,Tax_code_data_real) %>%
mutate(atr=if_else(y==0,0,tax/y),
x=y-tax)
X <-
f_tax_lm(Z)
Z <-
Z %>%
mutate(
OLS = (1-X$value[1]*y^(-(X$value[2])))*100
) %>%
mutate(atr=atr*100)
Z %>%
ggplot(aes(x=y)) +
geom_line(aes(y=atr,color="Data"),show.legend = T) +
geom_line(aes(y=OLS,color="Estimated"), show.legend = T) +
labs(x="Income",y="Average tax rate",
title="Including incomes below tax-free threshold") +
scale_color_manual(
name = "ATR",
guide = 'legend',
values = c("Data"="blue","Estimated"="red")
) +
scale_x_continuous(label=comma, breaks=seq(0,200000,20000))
ggsave(
here(results,"TP_01_ATR_TaxFunc_Including_below_tax_free.pdf")
)
Saving 6.88 x 4.25 in image
Excluding incomes below the tax-free threshold (by restricting the average tax rate to strictly positive values) significantly improves the fit of the estimated tax function.
Z <-
seq(0,200000,by=1000) %>%
as_tibble() %>%
rename(y=value) %>%
mutate(pid=1, # Artefact in function
year=2019) %>%
f_estimate_tax(.,Tax_code_data_real) %>%
mutate(atr=if_else(y==0,0,tax/y),
x=y-tax) %>%
filter(atr>0)
X <-
f_tax_lm(Z)
Z <-
Z %>%
mutate(
OLS = (1-X$value[1]*y^(-(X$value[2])))*100
) %>%
mutate(atr=atr*100)
Z %>%
ggplot(aes(x=y)) +
geom_line(aes(y=atr,color="Data"),show.legend = T) +
geom_line(aes(y=OLS,color="Estimated"), show.legend = T) +
labs(x="Income",y="Average tax rate",
title="Excluding incomes below tax-free threshold") +
scale_color_manual(
name = "ATR",
guide = 'legend',
values = c("Data"="blue","Estimated"="red")
)
ggsave(
here(results,"TP_01_ATR_TaxFunc_Excluding_below_tax_free.pdf")
)
Saving 6.88 x 4.25 in image
We estimate the progressivity parameter \(\tau\) on the standard income tax code from 1991 to 2019.
Z <-
seq(0,200000,by=1000) %>%
as_tibble() %>%
rename(y=value) %>%
mutate(pid=1, # Artefact in function
year=2019) %>%
f_estimate_tax(.,Tax_code_data_real) %>%
mutate(atr=if_else(y==0,0,tax/y),
x=y-tax)
X <-
f_tax_lm(Z)
Z <-
Z %>%
mutate(
OLS = (1-X$value[1]*y^(-(X$value[2])))*100
) %>%
mutate(atr=atr*100)
Z %>%
ggplot(aes(x=y)) +
geom_line(aes(y=atr,color="Data"),show.legend = T) +
geom_line(aes(y=OLS,color="Estimated"), show.legend = T) +
labs(x="Income",y="Average tax rate",
title="Including incomes below tax-free threshold") +
scale_color_manual(
name = "ATR",
guide = 'legend',
values = c("Data"="blue","Estimated"="red")
) +
scale_x_continuous(label=comma, breaks=seq(0,200000,20000))
ggsave(
here(results,"TP_01_ATR_TaxFunc_Including_below_tax_free.pdf")
)
Saving 6.88 x 4.25 in image
X <- readRDS((here(results,"TP_01_Tau_Tax_code.rds")))
X %>%
filter(term=="tau") %>%
ggplot(aes(x=year,y=value)) +
geom_point() +
geom_line() +
labs(
x = "Year", y = expression(paste("Progressivity ",tau))
) +
scale_x_continuous(breaks = seq(1990,2020,by=2)) +
scale_y_continuous(breaks = seq(0.05,0.2,0.01)) +
theme(
axis.title.x = element_text(margin = margin(t=10)),
axis.title.y = element_text(margin = margin(t=10))
)
ggsave(
here(results,"TP_01_ATR_TaxFunc_Trends_in_tau.pdf")
)
Saving 6.88 x 4.25 in image
We impose the following sample restrictions on the data.
\(y^{taxfree}\le y\le200,000\)
Income concept \(y\) is restricted between the tax free threshold for each year and $200,000 (in real terms). For the years in our sample, the top tax bracket ranges between $70,000 and $193,000
\(0< t \le0.47\)
Those with a negative average tax rate \(t\) and those with a tax rate over 47% are excluded from the sample. The rationale for this is that the top average tax rate between 1991-2019 was either 45% or 47%. As shown in the previous section, excluding those below the tax-free threshold ensures a better fit with actual tax rates.
A <-
read_feather(
here("Raw_data","LI_01_Derived_2019.feather"),
as_data_frame = F
) %>%
mutate(y=ic_taxable_income_loss) %>%
select(pid,age,year,y,inc_tax) %>%
collect()
Z <-
left_join(A,CPI,by="year") %>%
mutate(
across(
c(y,inc_tax),
~./cpi
)
) %>%
filter(inc_tax>=0&y>=0) %>%
mutate(
atr = if_else(y==0,0,inc_tax/y),
x = y-inc_tax # Post-tax income needed for OLS estimation
) %>%
filter(y>=0&y<=200000) %>%
filter(atr>0&atr<=0.47)
B <-
Z %>%
f_estimate_tax(.,Tax_code_data_real) %>%
mutate(tc_atr=if_else(y==0,0,tax/y), # tc_atr is tax code atr
tc_tax=tax)
Z <-
full_join(B,Z,by=c("pid","year","y")) %>%
filter(y>=taxfree&atr>0)
OLS <- f_tax_lm(Z) %>% mutate(method="OLS")
NLS <- f_tax_nls(Z) %>% mutate(method="NLS")
Coefs_temp <- bind_rows(OLS,NLS)
Q <-
Z %>%
mutate(
OLS = 1-OLS$value[1]*y^(-OLS$value[2]),
NLS = 1-NLS$value[1]*y^(-NLS$value[2])
)
QQ <-
Q %>%
slice_sample(prop=0.01) %>%
select(y,atr,tc_atr,OLS,NLS) %>%
rename(Income=y,Actual=atr,Statutory=tc_atr)
QQ %>%
ggplot(aes(x=Income)) +
geom_point(aes(y=Actual*100,color="Data"),show.legend = T) +
geom_line(aes(y=Statutory*100,color="Statutory"), show.legend = T) +
geom_point(aes(y=OLS*100,color="OLS"), show.legend = T) +
geom_line(aes(y=NLS*100,color="NLS"), show.legend = T) +
labs(x="Income",y="Average tax rate (%)") +
scale_color_manual(
name = "ATR",
guide = 'legend',
values = c("Data"="gray",
"Statutory"="black",
"OLS"="red",
"NLS"="green")
) +
scale_x_continuous(label=comma)
ggsave(
here(results,"TP_01_ATR_OLS_NLS_Actual_2019.pdf")
)
Saving 6.88 x 4.25 in image
sourceyears <- c(1991:2019)
for (i in seq_along(sourceyears)) {
A <-
read_feather(here("Raw_data",
paste0("LI_01_Derived_",sourceyears[i],".feather")),
as_data_frame = F) %>%
mutate(y = ic_taxable_income_loss) %>%
select(pid, age, year, y, inc_tax) %>%
collect()
Z <-
left_join(A,CPI,by="year") %>%
mutate(
across(
c(y,inc_tax),
~./cpi
)
) %>%
filter(inc_tax>=0&y>=0) %>%
mutate(
atr = if_else(y==0,0,inc_tax/y),
x = y-inc_tax # Post-tax income needed for OLS estimation
) %>%
filter(y>=0&y<=200000) %>%
filter(atr>0&atr<=0.47)
B <-
Z %>%
f_estimate_tax(.,Tax_code_data_real) %>%
mutate(tc_atr=if_else(y==0,0,tax/y), # tc_atr is tax code atr
tc_tax=tax)
Z <-
full_join(B,Z,by=c("pid","year","y")) %>%
filter(y>=taxfree&atr>0)
OLS <- f_tax_lm(Z) %>% mutate(method="OLS")
NLS <- f_tax_nls(Z) %>% mutate(method="NLS")
Coefs_temp <- bind_rows(OLS,NLS)
Q1 <-
Z %>%
mutate(
t = y-OLS$value[1]*y^(1-OLS$value[2]),
x = y-t,
) %>%
f_indices(.)
Q2 <-
Z %>%
mutate(
t = y-NLS$value[1]*y^(1-NLS$value[2]),
x = y-t,
) %>%
f_indices(.)
QQ <-
A %>%
mutate(
t = inc_tax,
x = y-t
) %>%
f_indices(.)
Coefs_temp <-
bind_rows(OLS,NLS) %>%
mutate(Suits_est = if_else(method=="OLS",Q1$S_area[1],Q2$S_area[1]),
Suits_data = QQ$S_area[1])
if (i==1){
Coefs=Coefs_temp
}
else{
Coefs=bind_rows(Coefs,Coefs_temp)
}
print(sourceyears[i])
}
saveRDS(Coefs,here(results,"TP_01_Tau_Lambda_Suits_data.rds"))
X <-
readRDS(here(results,"TP_01_Tau_Lambda_Suits_data.rds")) %>%
filter(term=="tau")
X %>%
ggplot(aes(x=year,y=value,color=method,shape=method)) +
geom_point() +
geom_line() +
labs(
x = "Year", y = expression(paste("Progressivity ",tau))
) +
scale_x_continuous(breaks = seq(1990,2020,by=2)) +
theme(
axis.title.x = element_text(margin = margin(t=10)),
axis.title.y = element_text(margin = margin(t=10))
)
ggsave(
here(results,"TP_01_Tau_trends_data.pdf")
)
Saving 6.88 x 4.25 in image
X <-
readRDS(here(results,"TP_01_Tau_Lambda_Suits_data.rds")) %>%
filter(term=="tau"&method=="NLS") %>%
rename(Estimate=Suits_est,Data=Suits_data) %>%
pivot_longer(cols = c(Estimate,Data), names_to = "Suits index", values_to = "yy")
X %>%
ggplot(aes(x=year,y=yy,color=`Suits index`,shape=`Suits index`)) +
geom_point() +
geom_line() +
labs(
x = "Year", y = "Suits index (taxable income as base)"
) +
scale_x_continuous(breaks = seq(1990,2020,by=2)) +
theme(
axis.title.x = element_text(margin = margin(t=10)),
axis.title.y = element_text(margin = margin(t=10))
)
ggsave(
here(results,"TP_01_Check_suits_index_tau_estimate.pdf")
)
Saving 6.88 x 4.25 in image