sample_sim.py

#!/usr/bin/env python
# -*- coding: ascii -*-
from __future__ import print_function
from __future__ import unicode_literals
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object

import random
from introrl.black_box_sims.sim_baseline import Simulation

# define layout to create output displays
row_1 = [ (0,0),   (0,1),    (0,2), 'Goal' ]
row_2 = [ (1,0),   '"Wall"', (1,2), 'Pit'  ]
row_3 = [ 'Start', (2,1),    (2,2), (2,3)  ]
s_hash_rowL = [row_1, row_2, row_3]

# add layout row and column markings (if any)
row_tickL = [0, 1, 2]
col_tickL = [0, 1, 2, 3]
x_axis_label = 'cols'
y_axis_label = 'rows'

# one way to define actions is an explicit dict of actions.
# (actions can also be generated by logic in a function; a sketch of that
#  alternative follows the dict below)
actionD = {(0, 0): ('D', 'R'),
           (0, 1): ('L', 'R'),
           (0, 2): ('L', 'D', 'R'),
           (1, 0): ('U', 'D'),
           (1, 2): ('U', 'D', 'R'),
           'Start': ('U', 'R'),
           (2, 1): ('L', 'R'),
           (2, 2): ('L', 'R', 'U'),
           (2, 3): ('L', 'U')  }
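
# A minimal sketch (editor's illustration, NOT part of the introrl API) of the
# function-based alternative mentioned above: derive the legal actions for a
# state directly from the layout by keeping moves that stay on the grid and
# avoid the wall cell. The helper name is hypothetical; it is not used below.
def actions_from_layout(s_hash):
    """Hypothetical helper: compute legal actions from s_hash_rowL."""
    if s_hash == 'Start':
        s_hash = (2, 0)  # 'Start' occupies grid position (row=2, col=0)
    row, col = s_hash
    moveD = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
    aL = []
    for a_desc, (dr, dc) in moveD.items():
        r, c = row + dr, col + dc
        # keep the move only if it lands on the layout and is not the wall
        if 0 <= r < len(s_hash_rowL) and 0 <= c < len(s_hash_rowL[r]) \
           and s_hash_rowL[r][c] != '"Wall"':
            aL.append(a_desc)
    return tuple(aL)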

# define rewards
rewardD = {'Goal': 1, 'Pit': -1}


class SampleSimulation( Simulation ):
    
    def __init__(self, name='Sample Gridworld Sim',
                 step_reward=-0.04,
                 random_transition_prob=0.2):
        """A Simulation of a Sample Gridworld"""
        
        self.step_reward = step_reward
        
        # probability of moving in random direction.
        self.random_transition_prob = random_transition_prob
        
        # call the parent Simulation constructor
        Simulation.__init__(self, name=name, 
                            s_hash_rowL=s_hash_rowL,
                            row_tickL=row_tickL, 
                            col_tickL=col_tickL, 
                            x_axis_label=x_axis_label, 
                            y_axis_label=y_axis_label)

        
        # the set of all non-terminal state hashes (those with defined actions)
        self.action_state_set = set( actionD.keys() )
    
        self.terminal_set = set( rewardD.keys() )

        # if there is a start state, define it.
        self.start_state_hash = 'Start'

    def get_action_snext_reward(self, s_hash, a_desc):
        """
        Return the next state, sn_hash, and the reward for taking
        action a_desc from state s_hash.
        """
        # with probability random_transition_prob (default 20%), replace the
        # input a_desc with a random legal action; otherwise (80%) take a_desc.
        if random.random() < self.random_transition_prob:
            a_desc = random.choice( actionD[s_hash] )
    
        # put 'Start' into (row,col) form
        if s_hash == 'Start':
            s_hash = (2,0)
            
        row,col = s_hash # all non-terminal s_hash are (row, col)
        if a_desc == 'U':
            row -= 1
        elif a_desc == 'D':
            row += 1
        elif a_desc == 'R':
            col += 1
        elif a_desc == 'L':
            col -= 1
        # no limit checking done... assume only legal moves are submitted
        sn_hash = s_hash_rowL[row][col]

        reward = rewardD.get(sn_hash, self.step_reward)
        
        return sn_hash, reward
        
    def get_state_legal_action_list(self, s_hash):
        """
        Return a list of possible actions from this state, including any
        actions thought to have zero probability.
        Return an empty list if the agent must simply guess.
        """
        return list( actionD.get( s_hash, () ) )
        

if __name__ == "__main__": # pragma: no cover
    
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.episode_summ_print import epi_summ_print
    from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
    
    
    sim = SampleSimulation( step_reward=-0.04 )
    sim.layout.s_hash_print( none_str='*' )
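
    # A quick illustration (editor's sketch, not in the original script):
    # sample one black-box transition from the start state. With the default
    # random_transition_prob=0.2 this usually moves 'U' to (1, 0) and earns
    # the step reward, but it can differ when the random action is taken.
    sn_hash, reward = sim.get_action_snext_reward( sim.start_state_hash, 'U' )
    print('Sample step: Start --U-->', sn_hash, ' reward =', reward)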
    
    
    policy, state_value = \
        sarsa_epsilon_greedy( sim, 
                              initial_Qsa=0.0, # initial Q(s,a) for non-terminal states (terminal states = 0.0)
                              read_pickle_file='', 
                              save_pickle_file='',
                              use_list_of_start_states=False, # False ==> use the environment's single start state
                              do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                              iteration_prints=0,
                              max_episode_steps=1000,
                              epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                              alpha=0.1, const_alpha=True, alpha_half_life=200,
                              N_episodes_wo_decay=0)
    
                          
    episode = make_episode( sim.start_state_hash, policy, sim, 
                            sim.terminal_set, max_steps=20 )
    epi_summ_print(episode, policy, sim, show_rewards=False,
                   show_env_states=True, none_str='*')

    sim.random_transition_prob = 0.0 # so arrows are drawn deterministically on policy diagram
    policy.save_diagram( sim, inp_colorD=None, save_name='sample_sim_policy',
                         show_arrows=True, scale=1.0, h_over_w=0.8, do_show=True)