%macro km(dataset, chemid, varid, censid, censval, outdata); /* nonparametric estimates of mean and variance for data */ /* with below detection limit observations */ /* method is to estimate cdf of observations, accounting for censoring */ /* using Kaplan-Meier estimate, then use the empirical cdf to */ /* estimate mean and sd of log(Y) and Y. */ /* SAS's estimate of F() assumes right censoring, adapt to left censoring */ /* by working with 1/Y. */ /* Warning: if the value of leftprob is large (i.e. > 0.3 or > 0.5), then */ /* the NP approach is very dependent on an arbitrary assumption about the */ /* small censored values. If so, this approach probably should not be used */ /* See discussion under leftprob for more details */ /* Input: dataset: name of SAS data set containing the observations */ /* chemid: name of variable identifying BY groups */ /* varid: name of variable identifying response */ /* N.B. Need to use Y, not log Y, here */ /* censid: name of variable identifying censored obs */ /* censval: value of &censid identifying a censored obs */ /* outdata: name a SAS data set to contain estimates */ /* example of use of censid and censval given in comments to %lr (in lrestv9.sas) */ /* Output: */ /* mean, sd: estimates of mean and sd. of response */ /* logmean, logsd: estimates of mean and sd. of log response */ /* geommean: estimate of geometric mean, exp(logmean) */ /* cv: estimate of coefficient of variation */ /* skew: estimate of skewness, derived from mean and median, */ /* not from the third moment */ /* leftprob: estimate of F(Ymin) when Ymin is censored */ /* The KM estimate of F(y) is undefined for y <= Ymin when Ymin is */ /* a censored value. If the smallest obs is censored */ /* this macro arbitrarily defines F(Ymin) by putting the appropriate */ /* probability as a point mass at Ymin/2. */ /* In other words, the NP estimates substitute DL/2 when the */ /* the smallest obs. is censored. This is ONLY done for the */ /* smallest obs. Larger censored values are treated properly */ /* The effect of this assumption is minimal if leftprob is small */ /* and can be considerably if leftprob is large (e.g. > 0.3 or >0.5) */ /* The evaluation of this is yet another paper awaiting some time */ data XXnew; set &dataset; ix = 1/abs(&varid); keep &chemid ix &censid; proc sort; by &chemid; /* compute KM estimate of cdf of 1/value */ proc lifetest method = km noprint outsurv = XXcdf2; by &chemid; time ix*&censid(&censval); /* since largest value of ix = smallest value of x gets S(t) = 0 */ /* need to add 1/(2n) to all values of S(t) */ /* first figure out N for this chemical. Ignore missing values */ proc means noprint data = XXnew; by &chemid; var ix; output out = XXn n = n; /* compute plotsurv to draw qqplots */ data XXcdf; merge XXcdf2 XXn; by &chemid; /* if survival = 1, then lowest obs are censored. Don't add anything */ if survival < 1 then plotsurv = survival + 1/(2*n); else plotsurv = survival; data &outdata; set XXcdf; by &chemid; retain mean median geommean logmean sx2 slx2 sd logsd cv ; if first.&chemid then do; mean = 0; sx2 = 0; slx2 = 0; logmean = 0; median = .; junk = lag(survival); end; if (_censor_ = 0) or last.&chemid then do; ls = lag(survival); lix = lag(ix); if _censor_ = 0 then pdf = ls - survival; else do; /* last obs is censored, repl with dl/2 */ pdf = ls; ix = 2*ix; end; if first.&chemid then pdf = .; if (pdf ne .) then do; mean = mean + pdf/ix; sx2 = sx2 + pdf/(ix*ix); logmean = logmean - pdf*log(ix); slx2 = slx2 + pdf*log(ix)**2; /* find the median: if S(x) = 0.5 exactly average two obs */ if (ls = 0.5) then median = (0.5/ix)+(0.5/lix); /* if S(x) crosses 0.5, then use current value */ if (ls > 0.5) & (survival < 0.5) then median = 1/ix; end; end; if last.&chemid then do; sd = sqrt(sx2-mean**2); logsd = sqrt(slx2-logmean**2); skew = (mean-median)/sd; geommean = exp(logmean); cv = sd/mean*100; if _censor_ = 1 then leftprob = ls; else leftprob = 0; output; end; drop ix sx2 survival sdf_lcl sdf_ucl junk lix pdf _censor_ ls plotsurv; %mend;