mc_fv_epsilon_greedy.py

#!/usr/bin/env python
# -*- coding: ascii -*-
from __future__ import print_function
from __future__ import unicode_literals
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object
import random

from introrl.policy import Policy
from introrl.agent_supt.action_value_run_ave_coll import ActionValueRunAveColl

from introrl.utils.banner import banner
from introrl.agent_supt.episode import Episode
from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.epsilon_calc import EpsilonGreedy

def mc_epsilon_greedy( environment, initial_policy='default', # can be 'default', 'random', policy_dictionary
                       read_pickle_file='', 
                       save_pickle_file='',
                       use_list_of_start_states=True, # use list OR single start state of environment.
                       iter_all_start_actions=False, # pick random or iterate all starting actions
                       first_visit=True, 
                       do_summ_print=True, showRunningAve=False, fmt_Q='%g', fmt_R='%g',
                       show_initial_policy=True,
                       max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001, gamma=0.9,
                       iteration_prints=0,
                       max_episode_steps=10000,
                       epsilon=0.1, const_epsilon=True, half_life=200,
                       N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ... 
    apply Monte Carlo Epsilon-Greedy control to find the OPTIMAL POLICY
    
    Returns: Policy and ActionValueRunAveColl objects
    
    Use Episode Discounted Returns to find Q(s,a), Action-Value Function
    
    Terminates when abserr < max_abserr (subject to the min_num_episodes and max_num_episodes limits)
    
    Q(s,a), action_value_ave, is created here and initialized to zero.
    
    Assume the environment provides "limited_start_state_list" (and a "start_state_hash")
    so that episodes can be started from a variety of start states.
    
    CREATES BOTH policy AND action_value OBJECTS.
    """
    
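    # epsilon-greedy action selector used for exploration while episodes are generated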
    eps_greedy = EpsilonGreedy(epsilon=epsilon, const_epsilon=const_epsilon, half_life=half_life,
                               N_episodes_wo_decay=N_episodes_wo_decay)
    
    # create Policy and ActionValueRunAveColl objects
    policy = Policy( environment=environment )
    if initial_policy=='default':
        print('Initializing Policy to "default" in mc_epsilon_greedy')
        policy.learn_a_legal_action_from_env( env=environment )
        policy.set_policy_from_piD( environment.get_default_policy_desc_dict() )
    elif initial_policy=='random':
        print('Initializing Policy to "random" in mc_epsilon_greedy')
        policy.intialize_policy_to_random(env=environment)
    else:
        print('Initializing Policy to "custom policy" in mc_epsilon_greedy')
        policy.learn_a_legal_action_from_env( env=environment )
        policy.set_policy_from_piD( initial_policy )


    action_value_ave = ActionValueRunAveColl( environment )
    action_value_ave.init_Qsa_to_zero() # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

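    # optionally warm-start the policy and Q(s,a) running averages from a previously saved pickle file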
    if read_pickle_file:
        policy.init_from_pickle_file( read_pickle_file )
        action_value_ave.init_from_pickle_file( read_pickle_file )
    
    if do_summ_print:
        if show_initial_policy:
            print('=============== STARTING WITH THE INITIAL POLICY ====================')
            policy.summ_print( verbosity=0, environment=environment, 
                       show_env_states=False, none_str='*')

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eps_greedy.summ_print()
                   
    s = 'Starting a Maximum of %i Monte Carlo Epsilon-Greedy Episodes\nfor "%s" with Gamma = %g'%\
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')
    
    
    # create an Episode object for getting returns
    episode = Episode( environment.name + ' Episode' )
    
    # set counter and flag
    num_episodes = 0
    keep_looping = True
    
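    # start states supplied by the environment (iterated when use_list_of_start_states is True)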
    limited_start_stateL = environment.limited_start_state_list()
    
    progress_str = ''
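    # main loop: generate episodes and improve the policy until the biggest Q(s,a)
    # running-average error drops below max_abserr (subject to min/max episode limits)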
    while (num_episodes < max_num_episodes) and keep_looping:
        
        keep_looping = False
        abserr = 0.0 # calculated below as part of termination criteria
        Nterminal_episodes = set()
        
        # Iterate over a list of known possible start states
        if use_list_of_start_states:
            loop_stateL = limited_start_stateL
            random.shuffle( loop_stateL )
        else:
            #loop_stateL = [ random.choice( limited_start_stateL ) ]
            loop_stateL = [ environment.start_state_hash ]
        
        for start_hash in loop_stateL:
            
            if iter_all_start_actions: # Iterate over ALL ACTIONS of start_hash
                a_descL = environment.get_state_legal_action_list( start_hash )
            else:
                a_desc = policy.get_single_action( start_hash )
                # if not iterating all actions, make sure first action has eps-greedy applied
                a_desc = eps_greedy( a_desc, 
                                     environment.get_state_legal_action_list( start_hash ) )
                a_descL = [ a_desc ]
            # randomize action order
            random.shuffle( a_descL )
            
            for a_desc in a_descL:
                
                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break
                
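                # generate one episode from start_hash, forcing a_desc as its first action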
                make_episode(start_hash, policy, 
                             environment, environment.terminal_set, 
                             episode=episode, first_a_desc=a_desc,
                             max_steps=max_episode_steps, eps_greedy=eps_greedy)
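                # count the finished episode toward the EpsilonGreedy schedule (matters when epsilon decays)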
                eps_greedy.inc_N_episodes()
                num_episodes += 1
                
                if episode.is_done():
                    Nterminal_episodes.add( start_hash )
            
                for dr in episode.get_rev_discounted_returns( gamma=gamma, 
                                                              first_visit=first_visit, 
                                                              visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val( s, a, G)
                    
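                    # policy improvement: point the policy at the highest-valued action for state s (ties broken randomly)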
                    aL = environment.get_state_legal_action_list( s )
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave( s, a )
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [ a ]
                            elif q == best_a_val:
                                bestL.append( a )
                        best_a_desc = random.choice( bestL )
                        policy.set_sole_action(s, best_a_desc)
                
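        # termination check: keep looping while the biggest Q(s,a) running-average error exceeds max_abserr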
        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True
            
        if num_episodes < min_num_episodes:
            keep_looping = True # must loop for min_num_episodes at least
        
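        # progress report (every 5% of max_num_episodes): policy score and terminal-episode counts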
        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%'%( 5*(int(pc_done/5.0) ) )
        if out_str != progress_str:
            score = environment.get_policy_score( policy=policy, start_state_hash=None, step_limit=1000)
            print(out_str, ' score=%s'%str(score), ' = (r_sum, n_steps, msg)', end=' ')
            print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
            progress_str = out_str
    print()
    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print( 'Exited Epsilon Greedy, MC First-Visit Value Iterations', s )
        print( '   num episodes    =', num_episodes, ' (min limit=%i)'%min_num_episodes, ' (max limit=%i)'%max_num_episodes )
        print( '   gamma           =', gamma )
        print( '   estimated err   =', abserr )
        print( '   Error limit     =', max_abserr )
        print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
    
        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q )
        policy.summ_print(  environment=environment, verbosity=0, show_env_states=False  )
        
        try: # sims may not have a layout_print
            environment.layout_print( vname='reward', fmt=fmt_R, show_env_states=False, none_str='*')
        except Exception:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file( save_pickle_file )
        action_value_ave.save_to_pickle_file( save_pickle_file )
        
    return policy, action_value_ave

if __name__ == "__main__": # pragma: no cover
    
    from introrl.mdp_data.simple_grid_world import get_gridworld    
    gridworld = get_gridworld()
    
    
    policy, action_value = mc_epsilon_greedy( gridworld, initial_policy='default',
                                              first_visit=True, 
                                              do_summ_print=True, showRunningAve=False, 
                                              fmt_Q='%g', fmt_R='%g',
                                              max_num_episodes=1000, min_num_episodes=10, 
                                              max_abserr=0.001, gamma=0.9,
                                              iteration_prints=0)
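
    # (optional) quick check: score the learned greedy policy with the same get_policy_score
    # call used during training above; it returns (r_sum, n_steps, msg).
    score = gridworld.get_policy_score( policy=policy, start_state_hash=None, step_limit=1000 )
    print( 'final policy score =', score, ' = (r_sum, n_steps, msg)' )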