function [Q_sarsa,Q_qlearn,rpt_sarsa,rpt_qlearn,n_sarsa,n_qlearn] = learn_cw(alpha,CF,s_start,s_end,MAX_N_EPISODES)
% LEARN_CW - Performs on-policy sarsa and Q-learning to learn the policy for the
% cliff walking problem example.
%
% NOTE(review): this copy of the file has lost its line breaks. As stored,
% everything after the first percent sign on a physical line is comment text,
% so the statements embedded in the two long lines below are NOT executed.
% In addition, the text between `rand` and `sideII )` is missing: the fragment
% `if( randsideII )` is presumably what is left of an epsilon test
% (rand compared against epsilon) at one end and a row clamp against sideII at
% the other -- TODO confirm against a clean copy. The lost span appears to have
% held the remainder of the episode loop (the `for ei=...` below is never
% closed), and the reward branch at the end reads `ii` and `jj`, which are not
% assigned anywhere in the visible text. Restore the file from the original
% source before relying on it; do not re-flow it by hand.
%
% Inputs (as used by the statements preserved in the text below):
%   alpha          - not referenced in the visible text; presumably the
%                    learning-rate step size -- TODO confirm.
%   CF             - 2-D grid; size(CF) gives the grid dimensions and a cell
%                    equal to 0 is treated as the cliff (reward -100 and the
%                    next state is reset to s_start).
%   s_start, s_end - two-element [row, col] grid positions of the start and
%                    terminal cells.
%   MAX_N_EPISODES - number of episodes the outer loop iterates over.
%
% Outputs (only their initialisation is visible here):
%   Q_sarsa, Q_qlearn     - nStates-by-nActions action-value arrays
%                           (nStates = numel of CF, nActions = 4), zeros.
%   rpt_sarsa, rpt_qlearn - 1-by-MAX_N_EPISODES arrays, zeros; presumably the
%                           reward accumulated per episode -- TODO confirm.
%   n_sarsa, n_qlearn     - nStates-by-nActions counts of how often each
%                           state/action pair was taken.
%
% Reward scheme visible at the end of the text: 0 on reaching s_end, -100 for
% stepping onto a cliff cell, -1 for any other step.
%
% Written by:
% --
% John L. Weatherwax 2007-12-03
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%----- PLOT_STEPS = 0; gamma = 1; % <- treat this as an undiscounted task epsilon = 0.1; % for our epsilon greedy policy % the number of states: [sideII,sideJJ] = size(CF); nStates = sideII*sideJJ; % on each grid we can choose from among this many actions (except on edges where this action is reduced): nActions = 4; % An array to hold the values of the action-value function Q_sarsa = zeros(nStates,nActions); Q_qlearn = zeros(nStates,nActions); rpt_sarsa = zeros(1,MAX_N_EPISODES); rpt_qlearn = zeros(1,MAX_N_EPISODES); n_sarsa = zeros(nStates,nActions); % <- lets store the number of times we are in this state and take this action n_qlearn = zeros(nStates,nActions); if( PLOT_STEPS ) figure; imagesc( CF ); colorbar; hold on; plot( s_start(2), s_start(1), 'x', 'MarkerSize', 10, 'MarkerFaceColor', 'k' ); plot( s_end(2), s_end(1), 'o', 'MarkerSize', 10, 'MarkerFaceColor', 'k' ); end % keep track of how many timesteps we take per episode: ets = zeros(MAX_N_EPISODES,1); ts=0; for ei=1:MAX_N_EPISODES, tic; if( ei==1 ) fprintf('working on episode %d...\n',ei); else fprintf('working on episode %d (ptt=%10.6f secs)...\n',ei, toc); tic; end ets(ei,1) = ts+1; % initialize the starting state st_sarsa = s_start; sti_sarsa = sub2ind( [sideII,sideJJ], st_sarsa(1), st_sarsa(2) ); st_qlearn = s_start; sti_qlearn = sub2ind( [sideII,sideJJ], st_qlearn(1), st_qlearn(2) ); % pick action using an epsilon greedy policy derived from Q: % [dum,at_sarsa] = max(Q_sarsa(sti_sarsa,:)); % at \in [1,2,3,4]=[up,down,right,left] if( randsideII ) stp1(1)=sideII; end if( stp1(2)<1 ) stp1(2)=1; end if( stp1(2)>sideJJ ) 
stp1(2)=sideJJ; end % get the reward for this step: % if( (ii==s_end(1)) && (jj==s_end(2)) ) % were at the end :) %rew = +1; rew = 0; elseif( CF(stp1(1),stp1(2))==0 ) % we fell off the cliff :( rew = -100; stp1 = s_start; else % normal step rew = -1; end