/* BOOTGINI.SAS - Bootstrap Gini coefficients from a data set */ /* Philip Dixon, Mar 1992 */ /* read the data and create two data sets: boot: contains the original data and all the bootstrap samples skew: contains the original data and samples needed to estimate the a coefficient in the accelerated bootstrap */ data boot skew; array data{200} d1-d200; /* array to store the original data, set up for a maximum of 200 data points. Increase here and in next statement if necessary. */ retain d1-d200 /* tell SAS not to clear the array */ npt 0; /* NPT counts the number of data points */ keep iboot /* IBOOT is the number of the bootstrap sample, 0 = original data */ iskew /* ISKEW is the number of sample in the skew data set, 0 = original data */ x; /* X is an observed value */ infile cards eof=write; /* goto the section labelled write when done reading the data */ /* USER specified parameters */ nboot = 1000; /* number of bootstrap samples to draw */ alpha = 0.05; /* 1 - size of confidence interval */ /* read in original data and copy to boot data set */ iboot = 0; /* iboot = 0 => working with original data */ input x @@; /* read a data value, possibly more than one on a line */ npt = npt + 1; /* update the number of data values */ data{npt} = x; /* store this value in the data array */ output boot; /* output to the boot data set */ return; /* and go get another input value */ /* after reading all the data, create the boot and skew data sets */ write: /* generate nboot bootstrap samples and output them */ do iboot = 1 to nboot; /* loop executed once for each sample */ do i = 1 to npt; /* loop executed once for each point */ pt = 1+floor(npt*uniform(0)); /* randomly choose an integer from 1 to npt, with replacement */ x = data{pt}; /* find that data value in the array */ output boot; /* and store it on the data set */ end; end; /* use SAS macro variables to store useful pieces of information between data steps */ call symput("npt",npt); /* the number of data points */ call symput("nboot",nboot); /* the number of bootstrap samples */ call symput("alpha",alpha); /* and the alpha level */ /* The accelated bootstrap needs the coefficient a, which is the skewness of the influence function. Estimate this by the skewness of the empirical influence estimates. For each data point, compute the empirical influence by comparing the statistic calculated from the original data set repeated 5 times with the statistic calculated from 6 repeats of the target data point and five repeats of all other data set. */ t = 5; /* number of copies of each point */ call symput("t",t); /* store in a macro variable for future use */ do iskew = 0 to npt; /* loop to repeat once for each data point, ISKEW = 0 indicates just multiplying original data, not adding any points. */ if iskew > 0 then do; /* If need to add a point, do it*/ x = data{iskew}; /* by finding that data value in the array */ output skew; /* and storing it on the skew data set */ end; do j = 1 to npt; /* loop to add t copies of every data value */ x = data{j}; /* find the j'th value from the array */ do k = 1 to t; /* and add t copies to the skew data set */ output skew; end; end; end; /* USER adds raw data here. Current input command is set up to read values of one variable. These may be stored 1 value per line or more than one value per line. An unlimited number of lines of data is permitted. If more than 200 data values, users needs to increase the size of the data array and increase the number of d variables used (see top of this data step */ cards; 18 18 20 23 25 28 ; /* Calculate Gini coefficient for the original data and each bootstrap sample. To bootstrap other statistics, replace the following PROC SORT and DATA step with a procedure that calculates a statistic BY IBOOT and stores it in a variable named bootx on an output data set named boots. */ proc sort data=boot; /* sort each bootstrap sample so that the */ by iboot x; /* values are in increasing order. This */ /* is required by the Dixon et al algorithm */ /* see Ecology /* to calculate the Gini coeff. */ data boots; set boot; file 'boot6.dat'; retain ginisum /* Accumulates (2i-n-1)*X */ sum /* Accumulates X, so we can calculate mean */ ipt /* index of data value, 1 for smallest, /* npt for largest value. */ npt; /* number of values in this sample */ by iboot; /* Need to do these calculations separately /* for each bootstrap sample */ if first.iboot then do; /* if this is the first value in the sample */ ginisum = 0; /* initialize sums and counters */ ipt = 0; sum = 0; npt = symget("npt"); end; ipt = ipt + 1; /* Then add 1 to the number of data points */ sum = sum + x; /* Add the value to the sum */ ginisum = ginisum + (2*ipt - (npt + 1))*x; /* and add (2i-n-1)X to its sum */ if last.iboot then do; /* if just did the last value in the sample, */ bootx = ginisum /((npt-1)*sum); /* calculate Gini coeff. for sample */ output; /* store on boots data set */ if iboot = 0 then call symput("sample",bootx); /* if this was the Gini coeff for the obs. */ /* data, store it it a macro variable */ put bootx; end; keep bootx iboot; /* boots data set only needs the estimate */ /* from the bootstrap sample (BOOTX) and the */ /* number of the bootstrap sample (IBOOT, */ /* needed to keep the original sample (IBOOT=0) */ /* separate from the bootstrap reps (IBOOT > 0) */ /* Calculate the Gini coefficient for each set of data in SKEW. Each unique set identifed by a unique value of ISKEW. N.B. This is needed only to compute the a coefficient for the accelerated bootstrap c.i. Algorithm identical to that used for the bootstrap replicates above, except that the number of points in each skew data is N*T or N*T+1 */ proc sort data=skew; by iskew x; data skews; set skew; retain ginisum ipt sum npt; by iskew; if first.iskew then do; ginisum = 0; ipt = 0; sum = 0; npt = symget("npt") * symget("t"); /* # pts in unperturbed skew data */ if iskew = 0 then npt = npt + 1; /* # one extra pt in each perturbed */ end; ipt = ipt + 1; sum = sum + x; ginisum = ginisum + (2*ipt - (npt + 1))*x; if last.iskew then do; bootx = ginisum /((npt-1)*sum); output; end; keep bootx iskew; /* calculate influence function for each data point. Again, only needed to get the a coefficient for the accel. bootstrap */ data _null_; /* Won't create an output data set */ retain s0 /* S0 is the Gini coeff. in the original data */ u3 0 /* U3 accumulates third moments of I */ u2 0; /* U2 accumulates second moments of I, */ /* both are used to calculate skewness */ set skews; /* work with the Gini coeff calculated */ /* for each perturbed data set */ npt = symget("npt"); /* NPT = number of points in original data, */ /* used here to figure out # skew data sets */ if iskew = 0 then s0 = bootx; /* remember the original Gini value */ else do; /* if value from a perturbed data set, */ * infl = (bootx - s0) / (symget("t") + 1); /* calculate influence */ infl = (bootx - s0) / (npt*symget("t") + 1); /* calculate influence */ u3 = u3 + infl**3; /* and accumulate 3'rd */ u2 = u2 + infl**2; /* and 2'nd moments */ end; if iskew = npt then do; /* if this is the last perturbed data set, */ a = u3/(6*u2**(3/2)); /* calculate skewness of influence */ call symput("a", a); /* and store it in a macro variable */ end; /* Calculate the percentiles for the bias-corrected and accelerated bootstraps */ data _null_; /* Again, won't create an output data set */ set boots; retain m0 /* M0: Gini coeff. in original data set */ p0 0; /* P0 will count number of bootstrap */ /* Gini coeff. less than M0. This is */ /* used to get the z0 coefficient for */ /* bias correction */ if iboot = 0 then m0 = bootx; /* store the original Gini coefficient */ else do; /* if a bootstrap repl, */ if bootx < m0 then /* see if less than original value. If so, */ p0 = p0 + 1; /* add 1 to P0 */ end; /* code below here does the calculations to get the percentiles */ if iboot = symget("nboot") then do; /* after looking at all bootstrap repl */ alpha = symget("alpha"); /* remember what alpha level was */ a = symget("a"); /* and we calculated for a coeff. */ z0 = probit(p0 / symget("nboot")); /* calculate z0 coefficient */ za2 = probit(1-alpha/2); /* use alpha to calculate crit. value */ /* will be 1.96 for alpha = 0.05 */ /* percentiles for percentile confidence interval */ pl = 100*alpha/2; /* these depend just on alpha */ pu = 100*(1-alpha/2); /* percentiles for bias-corrected ci */ plbc = 100*probnorm(2*z0-za2); /* these depend on z0 */ pubc = 100*probnorm(2*z0+za2); /* percentiles for accelerated method */ plbca = 100*probnorm(z0 + (z0-za2)/(1-a*(z0-za2))); /* these depend on z0 */ pubca = 100*probnorm(z0 + (z0+za2)/(1-a*(z0+za2))); /* and a */ /* store percentiles in macro variables to be used by proc univariate */ call symput("pl",pl); call symput("pu",pu); call symput("plbc",plbc); call symput("pubc",pubc); call symput("plbca",plbca); call symput("pubca",pubca); /* also store 1-alpha and the z0 value to use in a title */ call symput("ci",100*(1-alpha)); call symput("z0",z0); end; /* find the appropriate percentiles of the bootstrap distribution */ proc univariate data=boots; where iboot > 0; /* Don't include sample value */ var bootx; output out=perc /* store percentiles in a data set */ pctlpts = &pl &pu &plbc &pubc &plbca &pubca /* specifies desired %'tiles */ pctlname=ci_l ci_u bc_l bc_u a_l a_u /* and their variable names */ pctlpre = boot; /* needed, but ignored by SAS v 6.07 */ /* Finally, print the values for each confidence interval */ /* " are necessary - takes advantage of a SAS version 6 trick to */ /* expand the macro variables inside titles */ proc print noobs; title "&ci percent bootstrap confidence intervals"; title2 "Sample value: &sample"; title3 "&nboot bootstrap samples, constants: z0 = &z0, a = &a"; title4 "percentiles for the bc interval are: &plbc, &pubc"; title5 "percentiles for the acclerated interval are: &plbca, &pubca"; run;