% % epage 325 % % Written by: % -- % John L. Weatherwax 2008-02-20 % % email: wax@alum.mit.edu % % Please send comments and especially bug reports to the % above email address. % %----- clear all; close all; clc; addpath('../../Code/CSTool'); % the number of samples to use in constructing our histogram: % n = 5000; randn('seed',0); rand('seed',0); % generate data from a univariate normal random variable: x = randn(1,n); % % THE HISTOGRAM: % % specify the bin width using the normal reference rule for histograms: % s = std(x); h = 3.5*s*(length(x)).^(-1/3); % get the limits, bins, bin centers etc: x_lim_left = -4; x_lim_rght = +4; t0 = x_lim_left; tm = x_lim_rght; rng = tm-t0; nbin = ceil(rng/h); bins = t0:h:(nbin*h+t0); % <- the bin edges ... bc = bins(1:end-1)+0.5*h; % <- the bin centers ... x(find(xx_lim_rght))=x_lim_rght; vk=histc(x,bins); vk(end)=[]; % normalize: fhat = vk/(n*h); fh = figure; stairs( bins, [fhat,fhat(end)], '-r' ); grid on; hold on; axis( [x_lim_left, x_lim_rght, 0, +Inf] ); % % THE FREQUENCY POLYGON: % % specify the bin width using the normal reference rule for frequency polygons: % s = std(x); h = 2.15*s*(length(x)).^(-1/5); t0 = x_lim_left; tm = x_lim_rght; rng = tm-t0; bins = t0:h:tm; vk = histc(x,bins); vk(end)=[]; fhat = vk/(n*h); bc = (t0-h/2):h:(tm+h/2); % <- the bin centers ... with an empty bin center on each end ... binh = [0 fhat 0]; xinterp = linspace( x_lim_left, x_lim_rght ); fp = interp1(bc,binh,xinterp); figure(fh); plot( xinterp, fp, '-x' ); axis( [x_lim_left, x_lim_rght, 0, +Inf] ); % % THE KERNEL DENSITY ESTIMATE (using the normal reference rule for Gaussian kernels) % % (at the points xinterp) % fhat = zeros(size(xinterp)); h = 1.06*n^(-1/5); for i=1:n, f = normpdf(xinterp,x(i),h); fhat = fhat + f/n; end figure(fh); plot( xinterp, fhat, '-md' ); axis( [x_lim_left, x_lim_rght, 0, +Inf] ); % plot the TRUTH: % figure(fh); plot( xinterp, normpdf(xinterp,0,1), '-og' ); xlabel('x'); ylabel('PDF'); axis( [x_lim_left, x_lim_rght, 0, +Inf] ); title( 'normal distribution (normal reference rule)' ); saveas(gcf,'prob_8_4_norm_nrr','epsc'); %pause; % % Do a Monte-Carlo study to compute the average MSE $(\hat{f}(x)-f(x))^2$ % ... for each PDF approximation method ... % % N_MC = 100; xinterp = linspace( x_lim_left, x_lim_rght, 150 ); all_mse = zeros(3,length(xinterp),N_MC); amse = zeros(3,1); for mci=1:N_MC, x = randn(1,n); % get new data % the histogram estimate: [bc,fhat,bins] = norm_ref_rule_hist(x); pdf_approx = interp1(bc,fhat,xinterp,'nearest',0); sefx = (pdf_approx - normpdf(xinterp,0,1)).^2; all_mse(1,:,mci) = sefx; amse(1) = amse(1) + (1/N_MC)*trapz(xinterp,sefx); % the frequencey polygon estimate: [fp,fpx] = csfreqpoly(x); delete(gcf); pdf_approx = interp1(fpx,fp,xinterp,'linear',0); sefx = (pdf_approx - normpdf(xinterp,0,1)).^2; all_mse(2,:,mci) = sefx; amse(2) = amse(2) + (1/N_MC)*trapz(xinterp,sefx); % the kernel density estimate (evaluated AT xinterp): fhat = cskern1d(xinterp,x); sefx = (fhat - normpdf(xinterp,0,1)).^2; all_mse(3,:,mci) = sefx; amse(3) = amse(3) + (1/N_MC)*trapz(xinterp,sefx); end fprintf('The Averge (over x) MSE for each method is: hist=%10.5f; freqPol=%10.5f; kernel=%10.5f\n',amse(1),amse(2),amse(3)); % plot the distribution of MSE over our domain "x": % t_all_mse = squeeze( mean( all_mse, 3 ) ); figure; ph=plot( xinterp, t_all_mse, '-' ); title( 'MSE error as a function of the domain x' ); legend(ph,{'histogram MSE', 'frequency poly MSE', 'Kernel density MSE'}); xlabel( 'x' ); ylabel( 'MSE' ); saveas( gcf, 'prob_8_4_x_mse', 'epsc' );