### GETTING STARTED library(survey) # Change this to wherever your data are stored hbai.data.dir <- "~/data/HBAI/HBAI1213/tab/" # Load the source data hbai <- read.delim(sprintf("%s%s", hbai.data.dir, "hbai1213_g4.tab")) # Set up the survey design, using person weights ppl.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWPP) # Some alternative weights: # GS_NEWCH - numbers of children # GS_NEWAD - numbers of adults # GS_NEWPN - numbers pensioners # Numbers of households - include only main benefit unit hh.svy <- svydesign(id=~1, data=subset(hbai, BENUNIT==1), weight=~GS_NEWHH) ### THE OVERALL DISTRIBUTION OF INCOME (cf Table 2b, p32) # Mean before-housing-costs income - £535.52 / week bhc.mean <- svymean(~S_OE_BHC, ppl.svy) # Deciles of BHC income bhc.deciles <- svyquantile(~S_OE_BHC, ppl.svy, quantiles=seq(0.1, 0.9, 0.1) ) bhc.deciles ### CHARTING THE INCOME DISTRIBUTION (cf Chart 2.4/BHC, p28) # Group incomes up to £1,000 in £10 bands hbai$bhc.inc.10grp <- cut(hbai$S_OE_BHC, seq(0,1000,by=10), right=FALSE, include.lowest=TRUE) # Since we have added variables, survey design must be respecified ppl.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWPP) # Counts of all people in each band bands.freq <- svytable(~bhc.inc.10grp, ppl.svy) pdf(file="income_dist-bhc.pdf",width=4.7,height=3) # From here on, set up the data specifically for plotting, # as per the chart in the HBAI report # First make a data.frame for plotting bands <- data.frame(ppl=bands.freq, lower=seq(0,990,10), upper=seq(10,1000,10) ) # Round deciles to nearest 10 and assign each £10 band to one deciles.rnd <- signif(bhc.deciles,2) bands$decile <- sapply(bands$upper, function(b) sum(b>deciles.rnd) + 1) # Locations of the numeric labels of each decile dec.labels <- data.frame(dec=1:10, x=(c(0,bhc.deciles) + c(bhc.deciles, 1000))/2) # Actually create the plot library(ggplot2) ggplot(bands)+ geom_rect(aes(xmin=lower, xmax=upper, ymin=0, ymax=ppl.Freq/10^6, fill=factor(decile%%2)) ) + scale_fill_brewer("",type="qual", palette="Paired", guide=FALSE) + scale_y_continuous("People (millions)", limits=c(0,1.5), expand=c(0,0)) + scale_x_continuous("Equivalised BHC income (£/week, £10 bands)", limits=c(0,1050), breaks=seq(0,1000,100), expand=c(0,0)) + geom_text(data=dec.labels, aes(x=x, y=0.05, label=dec), colour="white", fontface="bold", size=10/14*5) dev.off() ### WITHIN-GROUP POVERTY RATES (cf Table 3.5db) # According to economic status of adults # A dummy variable for counting whole group populations hbai$all <- 1 # Whether (at-risk-of) poverty according to various poverty thresholds: # 50%, 60%, and 70% of median AHC income hbai$ahc.poor.50 <- ( hbai$S_OE_AHC < hbai$MDOEAHC * 0.5 ) hbai$ahc.poor.60 <- ( hbai$S_OE_AHC < hbai$MDOEAHC * 0.6 ) hbai$ahc.poor.70 <- ( hbai$S_OE_AHC < hbai$MDOEAHC * 0.7 ) # Set up some labels for economic status econ.statuses <- c("1+ F/T self-employed", "Single/Couple, all in F/T work", "Couple, 1 F/T, 1 P/T", "Couple, 1 F/T, 1 not working", "No full time, 1+ P/T", "Workless, 1+ aged 60+", "Workless, 1+ unemployed", "Workless, other inactive") hbai$ad.ec.stat <- factor(econ.statuses[hbai$ECOBU], levels=econ.statuses) # Redefine the survey ppl.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWPP) # Calculate the proportion in low income for each threshold & group eact.pov <- svyby(~ahc.poor.50+ahc.poor.60+ahc.poor.70, ~ad.ec.stat, design=ppl.svy, svyratio, denominator=~all) # eact.pov ### BETWEEN-YEAR COMPARISON OF INCOME DISTRIBUTION # Use the HBAI deflators to match the published results. # The income deflators given in the HBAI User Guide. all.years <- data.frame( year1=1994:2012, ahc.deflator=c(148.8, 153.0, 155.9, 159.0, 159.7, 162.5, 161.8, 164.5, 166.8, 169.4, 171.5, 174.5, 179.8, 184.6, 192.7, 198.8, 209.7, 222.0, 229.5) ) # Give each year its proper label (e.g. "2012/13") all.years$label <- sapply(all.years$year1, function(yr) sprintf("%s/%s", yr, substring(sprintf("%i", yr+1), 3, 4 ) ) ) # Find the corresponding data files for each year. There are slight # variations in how they are named. all.years$data.file <- sapply(all.years$year1, function(yr1) Sys.glob(sprintf("%shbai%s*.tab", hbai.data.dir, substring(sprintf("%i",yr1), 3, 4) ) ) ) # Read all the data files in one go and keep them in memory. This can be # convenient, but uses more memory (≈700MB). Otherwise, read and process # one file at a time. all.years.data <- sapply(all.years$data.file, read.delim) # Function to give quintile mid-points and mean of AHC income yr.quints.and.mean <- function(yr.data) { svy.ppl <- svydesign(ids=~1, weights=~GS_NEWPP, data=yr.data) c(svyquantile(~S_OE_AHC, svy.ppl, seq(0.1, 0.9, 0.2)), svymean(~S_OE_AHC, svy.ppl)) } # Get the nominal (in-year) values and means nominal.qiles <- sapply(all.years.data, yr.quints.and.mean) # Convert all our values to 2012/13 terms (index 229.5) # = value / source year's index * target year's index real.qiles <- apply(nominal.qiles, 1, function(nom.val) mapply(prod, nom.val, 1/all.years$ahc.def, 229.5) ) # Label and show the output rownames(real.qiles) <- all.years$label colnames(real.qiles) <- c(paste("Quintile",1:5), "Mean") # real.qiles pdf(file="gini-series.pdf",width=4.7,height=3) ### MEASURES OF INCOME INEQUALITY # Needed to calculate Gini coefficient from weighted data library(reldist) # Calculate BHC and AHC gini (all-people-weighted) gini.bhc <- sapply(all.years.data, function(d) gini(d$S_OE_BHC, d$GS_NEWPP)) gini.ahc <- sapply(all.years.data, function(d) gini(d$S_OE_AHC, d$GS_NEWPP)) # A data frame for plotting ginis <- data.frame(year=all.years$label, gini.bhc=gini.bhc, gini.ahc=gini.ahc) # In order to convert to "long" format library(reshape2) # Note that ggplot2 (rightly) does not allow the use of multiple y-axes, # so we can't plot the 90/10 ratio on the same chart ggplot(melt(ginis),aes(x=year, y=round(value*100), colour=variable, group=variable)) + geom_path() + geom_point() + scale_y_continuous("Gini coefficient", limits=c(30,42), breaks=seq(30,42,2)) + scale_colour_brewer("Income variable", type="qual", palette="Paired") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) dev.off() ### ESTIMATING ERROR OF ESTIMATES # Estimates of children and all people in poverty ppl.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWPP) ppl.pov <- svyratio(~ahc.poor.60, denominator=~all, design=ppl.svy, level=0.95) kid.svy <- svydesign(id=~1, data=hbai, weight=~GS_NEWCH) kid.pov <- svyratio(~ahc.poor.60, denominator=~all, design=kid.svy, level=0.95) # The assumed design factor des.fac <- 1.3 # The adjusted lower and uppper CIs for all individuals ppl.pov$ratio - ( SE(ppl.pov) * des.fac ) ppl.pov$ratio + ( SE(ppl.pov) * des.fac ) # Table showing this cis <- data.frame(estimate=c( ppl.pov$ratio, kid.pov$ratio), ci.low=c( ppl.pov$ratio - ( SE(ppl.pov) * des.fac ), kid.pov$ratio - ( SE(kid.pov) * des.fac ) ), ci.high=c( ppl.pov$ratio + ( SE(ppl.pov) * des.fac ), kid.pov$ratio + ( SE(kid.pov) * des.fac ) ) ) # Convert to percentages cis <- cis * 100 rownames(cis) <- c("All individual", "Children")